diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c36eaadfb..1777489ec 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,6 +23,9 @@ env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 jobs: macOS-latest-cmake-arm64: @@ -375,7 +378,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -401,7 +404,7 @@ jobs: continue-on-error: true steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: add oneAPI to apt shell: bash @@ -442,7 +445,7 @@ jobs: continue-on-error: true steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: add oneAPI to apt shell: bash @@ -546,7 +549,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -576,7 +579,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -610,7 +613,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -969,14 +972,14 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install id: depends run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait write-host "Completed AMD HIP SDK installation" diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 99feb28f2..699ac095d 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -20,6 +20,12 @@ on: types: [opened, synchronize, reopened] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] +env: + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_LOG_VERBOSITY: 10 + concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true @@ -173,6 +179,7 @@ jobs: if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests + $env:PYTHONIOENCODING = ":replace" behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests diff --git a/CMakeLists.txt b/CMakeLists.txt index a31320635..973907819 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,11 +82,11 @@ set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) # change the default for these ggml options if (NOT DEFINED GGML_LLAMAFILE) - set(GGML_LLAMAFILE ON) + set(GGML_LLAMAFILE_DEFAULT ON) endif() -if (NOT DEFINED GGML_CUDA_USE_GRAPHS) - set(GGML_CUDA_USE_GRAPHS ON) +if (NOT DEFINED GGML_CUDA_GRAPHS) + set(GGML_CUDA_GRAPHS_DEFAULT ON) endif() # transition helpers @@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} 
CACHE PATH "Location o # determining _precisely_ which defines are necessary for the llama-config # package. # +set(GGML_TRANSIENT_DEFINES) get_target_property(GGML_DIRECTORY ggml SOURCE_DIR) get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS) +if (GGML_DIR_DEFINES) + list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES}) +endif() get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS) -set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES}) +if (GGML_TARGET_DEFINES) + list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES}) +endif() get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) diff --git a/Makefile b/Makefile index 6053bc17b..f922f7083 100644 --- a/Makefile +++ b/Makefile @@ -54,6 +54,7 @@ TEST_TARGETS = \ tests/test-grammar-parser \ tests/test-json-schema-to-grammar \ tests/test-llama-grammar \ + tests/test-log \ tests/test-model-load-cancel \ tests/test-opt \ tests/test-quantize-fns \ @@ -148,6 +149,14 @@ GGML_NO_METAL := 1 DEPRECATE_WARNING := 1 endif +ifdef LLAMA_DISABLE_LOGS +REMOVE_WARNING := 1 +endif + +ifdef LLAMA_SERVER_VERBOSE +REMOVE_WARNING := 1 +endif + ifndef UNAME_S UNAME_S := $(shell uname -s) endif @@ -351,19 +360,11 @@ ifdef LLAMA_SANITIZE_UNDEFINED MK_LDFLAGS += -fsanitize=undefined -g endif -ifdef LLAMA_SERVER_VERBOSE - MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) -endif - ifdef LLAMA_SERVER_SSL MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT MK_LDFLAGS += -lssl -lcrypto endif -ifdef LLAMA_DISABLE_LOGS - MK_CPPFLAGS += -DLOG_DISABLE_LOGS -endif # LLAMA_DISABLE_LOGS - # warnings WARN_FLAGS = \ -Wall \ @@ -434,7 +435,7 @@ endif # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue -ifndef RISCV +ifndef RISCV_CROSS_COMPILE ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Use all CPU extensions that are available: @@ -514,7 +515,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),) MK_CXXFLAGS += -mlasx endif -else +ifneq ($(filter riscv64%,$(UNAME_M)),) + MK_CFLAGS += -march=rv64gcv -mabi=lp64d + MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d +endif + +else # RISC-V CROSS COMPILATION MK_CFLAGS += -march=rv64gcv -mabi=lp64d MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif @@ -613,7 +619,7 @@ ifdef GGML_CUDA CUDA_PATH ?= /usr/local/cuda endif - MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS + MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_NVCCFLAGS += -use_fast_math endif # GGML_MUSA @@ -925,6 +931,8 @@ OBJ_LLAMA = \ OBJ_COMMON = \ common/common.o \ + common/arg.o \ + common/log.o \ common/console.o \ common/ngram-cache.o \ common/sampling.o \ @@ -1021,6 +1029,14 @@ $(info - LLAMA_NO_CCACHE) $(info ) endif +ifdef REMOVE_WARNING +$(info !!! REMOVAL WARNING !!!) 
+$(info The following LLAMA_ options have been removed and are no longer supported) +$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info ) +endif + # # Build libraries # @@ -1157,6 +1173,16 @@ common/common.o: \ include/llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ +common/arg.o: \ + common/arg.cpp \ + common/arg.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +common/log.o: \ + common/log.cpp \ + common/log.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + common/sampling.o: \ common/sampling.cpp \ common/sampling.h \ @@ -1335,7 +1361,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ - $(OBJ_GGML) $(OBJ_LLAMA) + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1429,6 +1455,7 @@ llama-server: \ examples/server/system-prompts.js.hpp \ examples/server/prompt-formats.js.hpp \ examples/server/json-schema-to-grammar.mjs.hpp \ + examples/server/loading.html.hpp \ common/json.hpp \ common/stb_image.h \ $(OBJ_ALL) @@ -1448,7 +1475,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - ./llama-gen-docs libllava.a: examples/llava/llava.cpp \ examples/llava/llava.h \ @@ -1517,6 +1543,11 @@ tests/test-llama-grammar: tests/test-llama-grammar.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +tests/test-log: tests/test-log.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + tests/test-grammar-parser: tests/test-grammar-parser.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/README.md b/README.md index e30ab0c8c..4d24dd591 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics -- *add hot topics here* +- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ---- @@ -77,6 +77,7 @@ Typically finetunes of the base models below are supported as well. - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [OLMo](https://allenai.org/olmo) +- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924) - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) - [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520) @@ -89,6 +90,7 @@ Typically finetunes of the base models below are supported as well. 
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) +- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) @@ -163,6 +165,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) - [AIKit](https://github.com/sozercan/aikit) (MIT) - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) +- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* @@ -171,6 +174,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage +- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) **Infrastructure:** diff --git a/ci/run.sh b/ci/run.sh index 751bb0a02..1ac08ee4e 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -737,6 +737,9 @@ function gg_sum_embd_bge_small { ## main +export LLAMA_LOG_PREFIX=1 +export LLAMA_LOG_TIMESTAMPS=1 + if [ -z ${GG_BUILD_LOW_PERF} ]; then # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt rm -rf ${SRC}/models-mnt diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 2c72793b8..042e895ad 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -51,19 +51,23 @@ endif() set(TARGET common) add_library(${TARGET} STATIC + arg.cpp + arg.h base64.hpp - common.h common.cpp - sampling.h - sampling.cpp - console.h + common.h console.cpp - json.hpp + console.h json-schema-to-grammar.cpp - train.h - train.cpp - ngram-cache.h + json.hpp + log.cpp + log.h ngram-cache.cpp + ngram-cache.h + sampling.cpp + sampling.h + train.cpp + train.h ) if (BUILD_SHARED_LIBS) diff --git a/common/arg.cpp b/common/arg.cpp new file mode 100644 index 000000000..60e37a89a --- /dev/null +++ b/common/arg.cpp @@ -0,0 +1,1994 @@ +#include "arg.h" + +#include "log.h" +#include "sampling.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json-schema-to-grammar.h" + +using json = nlohmann::ordered_json; + +llama_arg & llama_arg::set_examples(std::initializer_list examples) { + this->examples = std::move(examples); + return *this; +} + +llama_arg & llama_arg::set_env(const char * env) { + help = help + "\n(env: " + env + ")"; + this->env = env; + return *this; +} + +llama_arg & llama_arg::set_sparam() { + is_sparam = true; + return *this; +} + +bool llama_arg::in_example(enum llama_example ex) { + return examples.find(ex) != examples.end(); +} + +bool llama_arg::get_value_from_env(std::string 
& output) { + if (env == nullptr) return false; + char * value = std::getenv(env); + if (value) { + output = value; + return true; + } + return false; +} + +bool llama_arg::has_value_from_env() { + return env != nullptr && std::getenv(env); +} + +static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string line; + auto add_line = [&](const std::string& l) { + if (l.length() <= max_char_per_line) { + result.push_back(l); + } else { + std::istringstream line_stream(l); + std::string word, current_line; + while (line_stream >> word) { + if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { + if (!current_line.empty()) result.push_back(current_line); + current_line = word; + } else { + current_line += (!current_line.empty() ? " " : "") + word; + } + } + if (!current_line.empty()) result.push_back(current_line); + } + }; + while (std::getline(iss, line)) { + add_line(line); + } + return result; +} + +std::string llama_arg::to_string() { + // params for printing to console + const static int n_leading_spaces = 40; + const static int n_char_per_line_help = 70; // TODO: detect this based on current console + std::string leading_spaces(n_leading_spaces, ' '); + + std::ostringstream ss; + for (const auto arg : args) { + if (arg == args.front()) { + if (args.size() == 1) { + ss << arg; + } else { + // first arg is usually abbreviation, we need padding to make it more beautiful + auto tmp = std::string(arg) + ", "; + auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' '); + ss << tmp << spaces; + } + } else { + ss << arg << (arg != args.back() ? ", " : ""); + } + } + if (value_hint) ss << " " << value_hint; + if (value_hint_2) ss << " " << value_hint_2; + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; + } + return ss.str(); +} + +// +// utils +// + +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +static std::string format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +static void gpt_params_handle_model_default(gpt_params & params) { + if (!params.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (params.hf_file.empty()) { + if (params.model.empty()) { + throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + } + params.hf_file = params.model; + } else if (params.model.empty()) { + params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); + } + } else if (!params.model_url.empty()) { + if (params.model.empty()) { + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + params.model = fs_get_cache_file(string_split(f, '/').back()); + } + } else if (params.model.empty()) { + params.model = DEFAULT_MODEL_PATH; + } +} + +// +// CLI argument parsing functions +// + +static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) { + std::string arg; + const std::string arg_prefix = "--"; + gpt_params & params = ctx_arg.params; + + std::unordered_map arg_to_options; + for (auto & opt : ctx_arg.options) { + for (const auto & arg : opt.args) { + arg_to_options[arg] = &opt; + } + } + + // handle environment variables + for (auto & opt : ctx_arg.options) { + std::string value; + if (opt.get_value_from_env(value)) { + try { + if (opt.handler_void && (value == "1" || value == "true")) { + opt.handler_void(params); + } + if (opt.handler_int) { + opt.handler_int(params, std::stoi(value)); + } + if (opt.handler_string) { + opt.handler_string(params, value); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); + } + } + } + + // handle command line arguments + auto check_arg = [&](int i) { + if (i+1 >= argc) { + throw std::invalid_argument("expected value for argument"); + } + }; + + for (int i = 1; i < argc; i++) { + const std::string arg_prefix = "--"; + + std::string arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); + } + auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + } + try { + if (opt.handler_void) { + opt.handler_void(params); + continue; + } + + // arg with single value + check_arg(i); + std::string val = argv[++i]; + if (opt.handler_int) { + opt.handler_int(params, std::stoi(val)); + continue; + } + if (opt.handler_string) { + opt.handler_string(params, val); + continue; + } + + // arg with 2 values + check_arg(i); + std::string val2 = argv[++i]; + if (opt.handler_str_str) { + opt.handler_str_str(params, val, val2); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), e.what(), 
arg_to_options[arg]->to_string().c_str())); + } + } + + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); + } + + gpt_params_handle_model_default(params); + + if (params.escape) { + string_process_escapes(params.prompt); + string_process_escapes(params.input_prefix); + string_process_escapes(params.input_suffix); + for (auto & antiprompt : params.antiprompt) { + string_process_escapes(antiprompt); + } + } + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + return true; +} + +static void gpt_params_print_usage(gpt_params_context & ctx_arg) { + auto print_options = [](std::vector & options) { + for (llama_arg * opt : options) { + printf("%s", opt->to_string().c_str()); + } + }; + + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; + for (auto & opt : ctx_arg.options) { + // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + if (opt.is_sparam) { + sparam_options.push_back(&opt); + } else if (opt.in_example(ctx_arg.ex)) { + specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); + } + } + printf("----- common params -----\n\n"); + print_options(common_options); + printf("\n\n----- sampling params -----\n\n"); + print_options(sparam_options); + // TODO: maybe convert enum llama_example to string + printf("\n\n----- example-specific params -----\n\n"); + print_options(specific_options); +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { + auto ctx_arg = gpt_params_parser_init(params, ex, print_usage); + const gpt_params params_org = ctx_arg.params; // the example can modify the default params + + try { + if (!gpt_params_parse_ex(argc, argv, ctx_arg)) { + ctx_arg.params = params_org; + return false; + } + if (ctx_arg.params.usage) { + gpt_params_print_usage(ctx_arg); + if (ctx_arg.print_usage) { + ctx_arg.print_usage(argc, argv); + } + exit(0); + } + } catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + ctx_arg.params = params_org; + return false; + } + + return true; +} + +gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { + gpt_params_context ctx_arg(params); + ctx_arg.print_usage = print_usage; + ctx_arg.ex = ex; + + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto & sampler : params.sparams.samplers) { + sampler_type_chars += gpt_sampler_type_to_chr(sampler); + sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; + } + sampler_type_names.pop_back(); + + + /** + * filter options by example + * rules: + * - all examples inherit options from LLAMA_EXAMPLE_COMMON + * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + */ + auto add_opt = [&](llama_arg arg) { + if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + 
ctx_arg.options.push_back(std::move(arg)); + } + }; + + + add_opt(llama_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [](gpt_params & params) { + params.usage = true; + } + )); + add_opt(llama_arg( + {"--version"}, + "show version and build info", + [](gpt_params &) { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + )); + add_opt(llama_arg( + {"--verbose-prompt"}, + format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [](gpt_params & params) { + params.verbose_prompt = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--no-display-prompt"}, + format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + [](gpt_params & params) { + params.display_prompt = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-co", "--color"}, + format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [](gpt_params & params) { + params.use_color = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-t", "--threads"}, "N", + format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [](gpt_params & params, int value) { + params.cpuparams.n_threads = value; + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_env("LLAMA_ARG_THREADS")); + add_opt(llama_arg( + {"-tb", "--threads-batch"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads)", + [](gpt_params & params, int value) { + params.cpuparams_batch.n_threads = value; + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + )); + add_opt(llama_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [](gpt_params & params, int value) { + params.draft_cpuparams.n_threads = value; + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.n_threads = value; + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-C", "--cpu-mask"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", + [](gpt_params & params, const std::string & mask) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Cr", "--cpu-range"}, "lo-hi", + "range of CPUs for affinity. 
Complements --cpu-mask", + [](gpt_params & params, const std::string & range) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict"}, "<0|1>", + format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [](gpt_params & params, const std::string & value) { + params.cpuparams.strict_cpu = std::stoul(value); + } + )); + add_opt(llama_arg( + {"--prio"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll"}, "<0...100>", + format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [](gpt_params & params, const std::string & value) { + params.cpuparams.poll = std::stoul(value); + } + )); + add_opt(llama_arg( + {"-Cb", "--cpu-mask-batch"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Crb", "--cpu-range-batch"}, "lo-hi", + "ranges of CPUs for affinity. Complements --cpu-mask-batch", + [](gpt_params & params, const std::string & range) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict-batch"}, "<0|1>", + "use strict CPU placement (default: same as --cpu-strict)", + [](gpt_params & params, int value) { + params.cpuparams_batch.strict_cpu = value; + } + )); + add_opt(llama_arg( + {"--prio-batch"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll-batch"}, "<0|1>", + "use polling to wait for work (default: same as --poll)", + [](gpt_params & params, int value) { + params.cpuparams_batch.poll = value; + } + )); + add_opt(llama_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", + [](gpt_params & params, const std::string & range) { + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [](gpt_params & params, int value) { + params.draft_cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--prio-draft"}, "N", + format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [](gpt_params & params, int value) { + params.draft_cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Cbd", "--cpu-mask-batch-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", + [](gpt_params & params, const std::string & range) { + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--prio-batch-draft"}, "N", + format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--draft"}, "N", + format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + [](gpt_params & params, int value) { + params.n_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-ps", "--p-split"}, "N", + format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + [](gpt_params & params, const std::string & value) { + params.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-lcs", "--lookup-cache-static"}, "FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)", + [](gpt_params & params, const std::string & value) { + params.lookup_cache_static = value; + } + ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-lcd", "--lookup-cache-dynamic"}, "FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)", + [](gpt_params & params, const std::string & value) { + params.lookup_cache_dynamic = value; + } + ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-c", "--ctx-size"}, "N", + format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + [](gpt_params & params, int value) { + params.n_ctx = value; + } + ).set_env("LLAMA_ARG_CTX_SIZE")); + add_opt(llama_arg( + {"-n", "--predict", "--n-predict"}, "N", + format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + [](gpt_params & params, int value) { + params.n_predict = value; + } + ).set_env("LLAMA_ARG_N_PREDICT")); + add_opt(llama_arg( + {"-b", "--batch-size"}, "N", + format("logical maximum batch size (default: %d)", params.n_batch), + [](gpt_params & params, int value) { + params.n_batch = value; + } + ).set_env("LLAMA_ARG_BATCH")); + add_opt(llama_arg( + {"-ub", "--ubatch-size"}, "N", + format("physical maximum batch size (default: %d)", params.n_ubatch), + [](gpt_params & params, int value) { + params.n_ubatch = value; + } + ).set_env("LLAMA_ARG_UBATCH")); + add_opt(llama_arg( + {"--keep"}, "N", + format("number of tokens to keep from the initial prompt (default: %d, -1 
= all)", params.n_keep), + [](gpt_params & params, int value) { + params.n_keep = value; + } + )); + add_opt(llama_arg( + {"--no-context-shift"}, + format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), + [](gpt_params & params) { + params.ctx_shift = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--chunks"}, "N", + format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [](gpt_params & params, int value) { + params.n_chunks = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"-fa", "--flash-attn"}, + format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + [](gpt_params & params) { + params.flash_attn = true; + } + ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(llama_arg( + {"-p", "--prompt"}, "PROMPT", + ex == LLAMA_EXAMPLE_MAIN + ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" + : "prompt to start generation with", + [](gpt_params & params, const std::string & value) { + params.prompt = value; + } + )); + add_opt(llama_arg( + {"--no-perf"}, + format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](gpt_params & params) { + params.no_perf = true; + params.sparams.no_perf = true; + } + ).set_env("LLAMA_ARG_NO_PERF")); + add_opt(llama_arg( + {"-f", "--file"}, "FNAME", + "a file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } + )); + add_opt(llama_arg( + {"--in-file"}, "FNAME", + "an input file (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.in_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-bf", "--binary-file"}, "FNAME", + "binary file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + } + )); + add_opt(llama_arg( + {"-e", "--escape"}, + format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), + [](gpt_params & params) { + params.escape = true; + } + )); + add_opt(llama_arg( + {"--no-escape"}, + "do not process escape sequences", + [](gpt_params & params) { + params.escape = false; + } + )); + add_opt(llama_arg( + {"-ptc", "--print-token-count"}, "N", + format("print token count every N tokens (default: %d)", params.n_print), + [](gpt_params & params, int value) { + params.n_print = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache"}, "FNAME", + "file to cache prompt state for faster startup (default: none)", + [](gpt_params & params, const std::string & value) { + params.path_prompt_cache = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-all"}, + "if specified, saves user input and generations to cache as well\n", + [](gpt_params & params) { + params.prompt_cache_all = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-ro"}, + "if specified, uses the prompt cache but does not update it", + [](gpt_params & params) { + params.prompt_cache_ro = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-r", "--reverse-prompt"}, "PROMPT", + "halt generation at PROMPT, return control in interactive mode\n", + [](gpt_params & params, const std::string & value) { + params.antiprompt.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-sp", "--special"}, + format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + [](gpt_params & params) { + params.special = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-cnv", "--conversation"}, + format( + "run in conversation mode:\n" + "- does not print special tokens and suffix/prefix\n" + "- interactive mode is also enabled\n" + "(default: %s)", + params.conversation ? "true" : "false" + ), + [](gpt_params & params) { + params.conversation = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-i", "--interactive"}, + format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [](gpt_params & params) { + params.interactive = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-if", "--interactive-first"}, + format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), + [](gpt_params & params) { + params.interactive_first = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-mli", "--multiline-input"}, + "allows you to write or paste multiple lines without ending each in '\\'", + [](gpt_params & params) { + params.multiline_input = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix-bos"}, + "prefix BOS to user inputs, preceding the `--in-prefix` string", + [](gpt_params & params) { + params.input_prefix_bos = true; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix"}, "STRING", + "string to prefix user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_prefix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--in-suffix"}, "STRING", + "string to suffix after user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_suffix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--no-warmup"}, + "skip warming up the model with an empty run", + [](gpt_params & params) { + params.warmup = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--spm-infill"}, + format( + "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", + params.spm_infill ? "enabled" : "disabled" + ), + [](gpt_params & params) { + params.spm_infill = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--samplers"}, "SAMPLERS", + format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [](gpt_params & params, const std::string & value) { + const auto sampler_names = string_split(value, ';'); + params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); + } + ).set_sparam()); + add_opt(llama_arg( + {"-s", "--seed"}, "SEED", + format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED), + [](gpt_params & params, const std::string & value) { + params.sparams.seed = std::stoul(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--sampling-seq"}, "SEQUENCE", + format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.samplers = gpt_sampler_types_from_chars(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--ignore-eos"}, + "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", + [](gpt_params & params) { + params.sparams.ignore_eos = true; + } + ).set_sparam()); + add_opt(llama_arg( + {"--penalize-nl"}, + format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? 
"true" : "false"), + [](gpt_params & params) { + params.sparams.penalize_nl = true; + } + ).set_sparam()); + add_opt(llama_arg( + {"--temp"}, "N", + format("temperature (default: %.1f)", (double)params.sparams.temp), + [](gpt_params & params, const std::string & value) { + params.sparams.temp = std::stof(value); + params.sparams.temp = std::max(params.sparams.temp, 0.0f); + } + ).set_sparam()); + add_opt(llama_arg( + {"--top-k"}, "N", + format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + [](gpt_params & params, int value) { + params.sparams.top_k = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--top-p"}, "N", + format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + [](gpt_params & params, const std::string & value) { + params.sparams.top_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--min-p"}, "N", + format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + [](gpt_params & params, const std::string & value) { + params.sparams.min_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--tfs"}, "N", + format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + [](gpt_params & params, const std::string & value) { + params.sparams.tfs_z = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--typical"}, "N", + format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + [](gpt_params & params, const std::string & value) { + params.sparams.typ_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--repeat-last-n"}, "N", + format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + [](gpt_params & params, int value) { + params.sparams.penalty_last_n = value; + params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); + } + ).set_sparam()); + add_opt(llama_arg( + {"--repeat-penalty"}, "N", + format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_repeat = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--presence-penalty"}, "N", + format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_present = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--frequency-penalty"}, "N", + format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_freq = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--dynatemp-range"}, "N", + format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_range = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--dynatemp-exp"}, "N", + format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_exponent = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat"}, "N", + format("use 
Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), + [](gpt_params & params, int value) { + params.sparams.mirostat = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat-lr"}, "N", + format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_eta = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat-ent"}, "N", + format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_tau = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", + "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", + [](gpt_params & params, const std::string & value) { + std::stringstream ss(value); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + params.sparams.logit_bias.push_back({key, bias}); + } else { + throw std::invalid_argument("invalid input format"); + } + } catch (const std::exception&) { + throw std::invalid_argument("invalid input format"); + } + } + ).set_sparam()); + add_opt(llama_arg( + {"--grammar"}, "GRAMMAR", + format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--grammar-file"}, "FNAME", + "file to read grammar from", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(params.sparams.grammar) + ); + } + ).set_sparam()); + add_opt(llama_arg( + {"-j", "--json-schema"}, "SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.grammar = json_schema_to_grammar(json::parse(value));
+        }
+    ).set_sparam());
+    add_opt(llama_arg(
+        {"--pooling"}, "{none,mean,cls,last}",
+        "pooling type for embeddings, use model default if unspecified",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+            else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+            else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--attention"}, "{causal,non-causal}",
+        "attention type for embeddings, use model default if unspecified",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+            else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--rope-scaling"}, "{none,linear,yarn}",
+        "RoPE frequency scaling method, defaults to linear unless specified by the model",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+            else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ));
+    add_opt(llama_arg(
+        {"--rope-scale"}, "N",
+        "RoPE context scaling factor, expands context by a factor of N",
+        [](gpt_params & params, const std::string & value) {
+            params.rope_freq_scale = 1.0f / std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--rope-freq-base"}, "N",
+        "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
+        [](gpt_params & params, const std::string & value) {
+            params.rope_freq_base = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--rope-freq-scale"}, "N",
+        "RoPE frequency scaling factor, expands context by a factor of 1/N",
+        [](gpt_params & params, const std::string & value) {
+            params.rope_freq_scale = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-orig-ctx"}, "N",
+        format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
+        [](gpt_params & params, int value) {
+            params.yarn_orig_ctx = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-ext-factor"}, "N",
+        format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
+        [](gpt_params & params, const std::string & value) {
+            params.yarn_ext_factor = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-attn-factor"}, "N",
+        format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
+        [](gpt_params & params, const std::string & value) {
+            params.yarn_attn_factor = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-beta-slow"}, "N",
+        format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
+        [](gpt_params &
params, const std::string & value) { + params.yarn_beta_slow = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-fast"}, "N", + format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [](gpt_params & params, const std::string & value) { + params.yarn_beta_fast = std::stof(value); + } + )); + add_opt(llama_arg( + {"-gan", "--grp-attn-n"}, "N", + format("group-attention factor (default: %d)", params.grp_attn_n), + [](gpt_params & params, int value) { + params.grp_attn_n = value; + } + )); + add_opt(llama_arg( + {"-gaw", "--grp-attn-w"}, "N", + format("group-attention width (default: %.1f)", (double)params.grp_attn_w), + [](gpt_params & params, int value) { + params.grp_attn_w = value; + } + )); + add_opt(llama_arg( + {"-dkvc", "--dump-kv-cache"}, + "verbose print of the KV cache", + [](gpt_params & params) { + params.dump_kv_cache = true; + } + )); + add_opt(llama_arg( + {"-nkvo", "--no-kv-offload"}, + "disable KV offload", + [](gpt_params & params) { + params.no_kv_offload = true; + } + )); + add_opt(llama_arg( + {"-ctk", "--cache-type-k"}, "TYPE", + format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_k = value; + } + )); + add_opt(llama_arg( + {"-ctv", "--cache-type-v"}, "TYPE", + format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_v = value; + } + )); + add_opt(llama_arg( + {"--perplexity", "--all-logits"}, + format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + [](gpt_params & params) { + params.logits_all = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag"}, + "compute HellaSwag score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.hellaswag = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag-tasks"}, "N", + format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [](gpt_params & params, int value) { + params.hellaswag_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande"}, + "compute Winogrande score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.winogrande = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande-tasks"}, "N", + format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [](gpt_params & params, int value) { + params.winogrande_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice"}, + "compute multiple choice score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.multiple_choice = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice-tasks"}, "N", + format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + [](gpt_params & params, int value) { + params.multiple_choice_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--kl-divergence"}, + "computes KL-divergence to logits provided via 
--kl-divergence-base", + [](gpt_params & params) { + params.kl_divergence = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--save-all-logits", "--kl-divergence-base"}, "FNAME", + "set logits file", + [](gpt_params & params, const std::string & value) { + params.logits_file = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-stride"}, "N", + format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [](gpt_params & params, int value) { + params.ppl_stride = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-output-type"}, "<0|1>", + format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [](gpt_params & params, int value) { + params.ppl_output_type = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"-dt", "--defrag-thold"}, "N", + format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [](gpt_params & params, const std::string & value) { + params.defrag_thold = std::stof(value); + } + ).set_env("LLAMA_ARG_DEFRAG_THOLD")); + add_opt(llama_arg( + {"-np", "--parallel"}, "N", + format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](gpt_params & params, int value) { + params.n_parallel = value; + } + )); + add_opt(llama_arg( + {"-ns", "--sequences"}, "N", + format("number of sequences to decode (default: %d)", params.n_sequences), + [](gpt_params & params, int value) { + params.n_sequences = value; + } + ).set_examples({LLAMA_EXAMPLE_PARALLEL})); + add_opt(llama_arg( + {"-cb", "--cont-batching"}, + format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](gpt_params & params) { + params.cont_batching = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); + add_opt(llama_arg( + {"-nocb", "--no-cont-batching"}, + "disable continuous batching", + [](gpt_params & params) { + params.cont_batching = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); + add_opt(llama_arg( + {"--mmproj"}, "FILE", + "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + [](gpt_params & params, const std::string & value) { + params.mmproj = value; + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); + add_opt(llama_arg( + {"--image"}, "FILE", + "path to an image file. use with multimodal models. 
Specify multiple times for batching", + [](gpt_params & params, const std::string & value) { + params.image.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); +#ifdef GGML_USE_RPC + add_opt(llama_arg( + {"--rpc"}, "SERVERS", + "comma separated list of RPC servers", + [](gpt_params & params, const std::string & value) { + params.rpc_servers = value; + } + )); +#endif + add_opt(llama_arg( + {"--mlock"}, + "force system to keep model in RAM rather than swapping or compressing", + [](gpt_params & params) { + params.use_mlock = true; + } + )); + add_opt(llama_arg( + {"--no-mmap"}, + "do not memory-map model (slower load but may reduce pageouts if not using mlock)", + [](gpt_params & params) { + params.use_mmap = false; + } + )); + add_opt(llama_arg( + {"--numa"}, "TYPE", + "attempt optimizations that help on some NUMA systems\n" + "- distribute: spread execution evenly over all nodes\n" + "- isolate: only spawn threads on CPUs on the node that execution started on\n" + "- numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", + "number of layers to store in VRAM", + [](gpt_params & params, int value) { + params.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_env("LLAMA_ARG_N_GPU_LAYERS")); + add_opt(llama_arg( + {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [](gpt_params & params, int value) { + params.n_gpu_layers_draft = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-sm", "--split-mode"}, "{none,layer,row}", + "how to split the model across multiple GPUs, one of:\n" + "- none: use one GPU only\n" + "- layer (default): split layers and KV across GPUs\n" + "- row: split rows across GPUs", + [](gpt_params & params, const std::string & value) { + std::string arg_next = value; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } else { + throw std::invalid_argument("invalid value"); + } + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. 
Setting the split mode has no effect.\n"); + } + } + )); + add_opt(llama_arg( + {"-ts", "--tensor-split"}, "N0,N1,N2,...", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", + [](gpt_params & params, const std::string & value) { + std::string arg_next = value; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector<std::string> split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + throw std::invalid_argument( + format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + ); + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } else { + params.tensor_split[i] = 0.0f; + } + } + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n"); + } + } + )); + add_opt(llama_arg( + {"-mg", "--main-gpu"}, "INDEX", + format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + [](gpt_params & params, int value) { + params.main_gpu = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); + } + } + )); + add_opt(llama_arg( + {"--check-tensors"}, + format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + [](gpt_params & params) { + params.check_tensors = true; + } + )); + add_opt(llama_arg( + {"--override-kv"}, "KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + [](gpt_params & params, const std::string & value) { + if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + } + } + )); + add_opt(llama_arg( + {"--lora"}, "FNAME", + "path to LoRA adapter (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & value) { + params.lora_adapters.push_back({ std::string(value), 1.0 }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--lora-scaled"}, "FNAME", "SCALE", + "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.lora_adapters.push_back({ fname, std::stof(scale) }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--control-vector"}, "FNAME", + "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + [](gpt_params & params, const std::string & value) { + params.control_vectors.push_back({ 1.0f, value, }); + } + )); + add_opt(llama_arg( + {"--control-vector-scaled"}, "FNAME", "SCALE", + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.control_vectors.push_back({ std::stof(scale), fname }); + } + )); + add_opt(llama_arg( + {"--control-vector-layer-range"}, "START", "END", + "layer range to apply the control vector(s) to, start and end inclusive", + [](gpt_params & params, const std::string & start, const std::string & end) { + params.control_vector_layer_start = std::stoi(start); + params.control_vector_layer_end = std::stoi(end); + } + )); + add_opt(llama_arg( + {"-a", "--alias"}, "STRING", + "set alias for model name (to be used by REST API)", + [](gpt_params & params, const std::string & value) { + params.model_alias = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-m", "--model"}, "FNAME", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? 
std::string("model path from which to load base model") + : format( + "model path (default: `models/$filename` with filename from `--hf-file` " + "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH + ), + [](gpt_params & params, const std::string & value) { + params.model = value; + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); + add_opt(llama_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-mu", "--model-url"}, "MODEL_URL", + "model download url (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_url = value; + } + ).set_env("LLAMA_ARG_MODEL_URL")); + add_opt(llama_arg( + {"-hfr", "--hf-repo"}, "REPO", + "Hugging Face model repository (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO")); + add_opt(llama_arg( + {"-hff", "--hf-file"}, "FILE", + "Hugging Face model file (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(llama_arg( + {"-hft", "--hf-token"}, "TOKEN", + "Hugging Face access token (default: value from HF_TOKEN environment variable)", + [](gpt_params & params, const std::string & value) { + params.hf_token = value; + } + ).set_env("HF_TOKEN")); + add_opt(llama_arg( + {"--context-file"}, "FNAME", + "file to load context from (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.context_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-size"}, "N", + format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + [](gpt_params & params, int value) { + params.chunk_size = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-separator"}, "STRING", + format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [](gpt_params & params, const std::string & value) { + params.chunk_separator = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--junk"}, "N", + format("number of times to repeat the junk text (default: %d)", params.n_junk), + [](gpt_params & params, int value) { + params.n_junk = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"--pos"}, "N", + format("position of the passkey in the junk text (default: %d)", params.i_pos), + [](gpt_params & params, int value) { + params.i_pos = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"-o", "--output", "--output-file"}, "FNAME", + format("output file (default: '%s')", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? params.lora_outfile.c_str() + : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR + ? 
params.cvector_outfile.c_str() + : params.out_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.out_file = value; + params.cvector_outfile = value; + params.lora_outfile = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"-ofreq", "--output-frequency"}, "N", + format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + [](gpt_params & params, int value) { + params.n_out_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--save-frequency"}, "N", + format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + [](gpt_params & params, int value) { + params.n_save_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--process-output"}, + format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + [](gpt_params & params) { + params.process_output = true; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--no-ppl"}, + format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](gpt_params & params) { + params.compute_ppl = false; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--chunk", "--from-chunk"}, "N", + format("start processing the input from chunk N (default: %d)", params.i_chunk), + [](gpt_params & params, int value) { + params.i_chunk = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-pps"}, + format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), + [](gpt_params & params) { + params.is_pp_shared = true; + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npp"}, "n0,n1,...", + "number of prompt tokens", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-ntg"}, "n0,n1,...", + "number of text generation tokens", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npl"}, "n0,n1,...", + "number of parallel prompts", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"--embd-normalize"}, "N", + format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + [](gpt_params & params, int value) { + params.embd_normalize = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-output-format"}, "FORMAT", + "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", + [](gpt_params & params, const std::string & value) { + params.embd_out = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-separator"}, "STRING", + "separator of embeddings (default \\n) for example \"<#sep#>\"", + [](gpt_params & params, const std::string & value) { + params.embd_sep = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + 
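The numeric codes accepted by --embd-normalize above correspond to standard vector norms. Below is a minimal sketch of that mapping, assuming behaviour along the lines of the existing llama_embd_normalize helper in common; the function name and the 32760 int16 scale here are illustrative, not taken from this patch.

    // sketch only: interpretation of the --embd-normalize codes
    #include <algorithm>
    #include <cmath>
    #include <vector>

    static void embd_normalize_sketch(std::vector<float> & v, int mode) {
        double sum = 0.0;
        if (mode == -1) {                                   // -1 = no normalization
            return;
        } else if (mode == 0) {                             // 0 = max absolute int16
            for (float x : v) sum = std::max(sum, (double) std::fabs(x));
            sum /= 32760.0;                                 // assumed int16 headroom
        } else if (mode == 1) {                             // 1 = taxicab (L1) norm
            for (float x : v) sum += std::fabs(x);
        } else if (mode == 2) {                             // 2 = euclidean (L2) norm
            for (float x : v) sum += (double) x * x;
            sum = std::sqrt(sum);
        } else {                                            // >2 = p-norm
            for (float x : v) sum += std::pow(std::fabs(x), mode);
            sum = std::pow(sum, 1.0 / mode);
        }
        const float scale = sum > 0.0 ? (float) (1.0 / sum) : 0.0f;
        for (float & x : v) x *= scale;
    }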
add_opt(llama_arg( + {"--host"}, "HOST", + format("ip address to listen on (default: %s)", params.hostname.c_str()), + [](gpt_params & params, const std::string & value) { + params.hostname = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); + add_opt(llama_arg( + {"--port"}, "PORT", + format("port to listen on (default: %d)", params.port), + [](gpt_params & params, int value) { + params.port = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); + add_opt(llama_arg( + {"--path"}, "PATH", + format("path to serve static files from (default: %s)", params.public_path.c_str()), + [](gpt_params & params, const std::string & value) { + params.public_path = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--embedding", "--embeddings"}, + format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + [](gpt_params & params) { + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); + add_opt(llama_arg( + {"--api-key"}, "KEY", + "API key to use for authentication (default: none)", + [](gpt_params & params, const std::string & value) { + params.api_keys.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); + add_opt(llama_arg( + {"--api-key-file"}, "FNAME", + "path to file containing API keys (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream key_file(value); + if (!key_file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-key-file"}, "FNAME", + "path to a file containing a PEM-encoded SSL private key", + [](gpt_params & params, const std::string & value) { + params.ssl_file_key = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-cert-file"}, "FNAME", + "path to a file containing a PEM-encoded SSL certificate", + [](gpt_params & params, const std::string & value) { + params.ssl_file_cert = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-to", "--timeout"}, "N", + format("server read/write timeout in seconds (default: %d)", params.timeout_read), + [](gpt_params & params, int value) { + params.timeout_read = value; + params.timeout_write = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--threads-http"}, "N", + format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + [](gpt_params & params, int value) { + params.n_threads_http = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); + add_opt(llama_arg( + {"-spf", "--system-prompt-file"}, "FNAME", + "set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator<char>(file), + std::istreambuf_iterator<char>(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + 
{"--metrics"}, + format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_metrics = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + add_opt(llama_arg( + {"--no-slots"}, + format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_slots = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); + add_opt(llama_arg( + {"--slot-save-path"}, "PATH", + "path to save slot kv cache (default: disabled)", + [](gpt_params & params, const std::string & value) { + params.slot_save_path = value; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--chat-template"}, "JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + [](gpt_params & params, const std::string & value) { + if (!llama_chat_verify_template(value)) { + throw std::runtime_error(format( + "error: the supplied chat template is not supported: %s\n" + "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", + value.c_str() + )); + } + params.chat_template = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + add_opt(llama_arg( + {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", + format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [](gpt_params & params, const std::string & value) { + params.slot_prompt_similarity = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--lora-init-without-apply"}, + format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), + [](gpt_params & params) { + params.lora_init_without_apply = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--simple-io"}, + "use basic IO for better compatibility in subprocesses and limited consoles", + [](gpt_params & params) { + params.simple_io = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-ld", "--logdir"}, "LOGDIR", + "path under which to save YAML logs (no logging if unset)", + [](gpt_params & params, const std::string & value) { + params.logdir = value; + + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + } + )); + add_opt(llama_arg( + {"--positive-file"}, "FNAME", + format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_positive_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--negative-file"}, "FNAME", + format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_negative_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-batch"}, "N", + format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [](gpt_params & params, int value) { + params.n_pca_batch = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-iter"}, "N", + format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [](gpt_params & params, int value) { + params.n_pca_iterations = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--method"}, "{pca, mean}", + "dimensionality reduction method to be used (default: pca)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--output-format"}, "{md,jsonl}", + "output format for batched-bench results (default: md)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } + else if (value == "md") { params.batched_bench_output_jsonl = false; } + else { std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"--log-disable"}, + "Log disable", + [](gpt_params &) { + gpt_log_pause(gpt_log_main()); + } + )); + add_opt(llama_arg( + {"--log-file"}, "FNAME", + "Log to file", + [](gpt_params &, const std::string & value) { + gpt_log_set_file(gpt_log_main(), value.c_str()); + } + )); + add_opt(llama_arg( + {"--log-colors"}, + "Enable colored logging", + [](gpt_params &) { + gpt_log_set_colors(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_COLORS")); + add_opt(llama_arg( + {"-v", "--verbose", "--log-verbose"}, + "Set verbosity level to infinity (i.e. 
log all messages, useful for debugging)", + [](gpt_params & params) { + params.verbosity = INT_MAX; + gpt_log_set_verbosity_thold(INT_MAX); + } + )); + add_opt(llama_arg( + {"-lv", "--verbosity", "--log-verbosity"}, "N", + "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", + [](gpt_params & params, int value) { + params.verbosity = value; + gpt_log_set_verbosity_thold(value); + } + ).set_env("LLAMA_LOG_VERBOSITY")); + add_opt(llama_arg( + {"--log-prefix"}, + "Enable prefix in log messages", + [](gpt_params &) { + gpt_log_set_prefix(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_PREFIX")); + add_opt(llama_arg( + {"--log-timestamps"}, + "Enable timestamps in log messages", + [](gpt_params &) { + gpt_log_set_timestamps(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_TIMESTAMPS")); + + return ctx_arg; +} diff --git a/common/arg.h b/common/arg.h new file mode 100644 index 000000000..413de2c88 --- /dev/null +++ b/common/arg.h @@ -0,0 +1,77 @@ +#pragma once + +#include "common.h" + +#include <set> +#include <string> +#include <vector> + +// +// CLI argument parsing +// + +struct llama_arg { + std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON}; + std::vector<const char *> args; + const char * value_hint = nullptr; // help text or example for arg value + const char * value_hint_2 = nullptr; // for second arg value + const char * env = nullptr; + std::string help; + bool is_sparam = false; // is current arg a sampling param? + void (*handler_void) (gpt_params & params) = nullptr; + void (*handler_string) (gpt_params & params, const std::string &) = nullptr; + void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (gpt_params & params, int) = nullptr; + + llama_arg( + const std::initializer_list<const char *> & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &) + ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + + llama_arg( + const std::initializer_list<const char *> & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, int) + ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + + llama_arg( + const std::initializer_list<const char *> & args, + const std::string & help, + void (*handler)(gpt_params & params) + ) : args(args), help(help), handler_void(handler) {} + + // support 2 values for arg + llama_arg( + const std::initializer_list<const char *> & args, + const char * value_hint, + const char * value_hint_2, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &, const std::string &) + ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + + llama_arg & set_examples(std::initializer_list<enum llama_example> examples); + llama_arg & set_env(const char * env); + llama_arg & set_sparam(); + bool in_example(enum llama_example ex); + bool get_value_from_env(std::string & output); + bool has_value_from_env(); + std::string to_string(); +}; + +struct gpt_params_context { + enum llama_example ex = LLAMA_EXAMPLE_COMMON; + gpt_params & params; + std::vector<llama_arg> options; + void(*print_usage)(int, char **) = nullptr; + gpt_params_context(gpt_params & params) : params(params) {} +}; + +// parse input arguments from CLI +// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); 
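For orientation, here is a hypothetical usage sketch (not part of the patch) of how an example program is expected to consume the interface declared above; only the gpt_params_parse() signature and LLAMA_EXAMPLE_COMMON come from this header, the rest is illustrative.

    // sketch only: minimal consumer of the new arg-parsing API
    #include "arg.h"
    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;

        // fills `params` from argv (and any registered LLAMA_ARG_* / LLAMA_LOG_* env vars);
        // on an invalid value it prints the usage snippet of the offending argument only
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
            return 1;
        }

        // ... run the example using params ...
        return 0;
    }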
+ +// function to be used by test-arg-parser +gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/common.cpp b/common/common.cpp index 916b1731e..8d0ed4f95 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -3,6 +3,7 @@ #endif #include "common.h" +#include "log.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" @@ -25,7 +26,7 @@ #include #include #include -#include +#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -49,7 +50,6 @@ #if defined(LLAMA_USE_CURL) #include #include -#include #include #endif @@ -57,14 +57,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) -#define GGML_USE_CUDA_SYCL -#endif - -#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) -#define GGML_USE_CUDA_SYCL_VULKAN -#endif - #if defined(LLAMA_USE_CURL) #ifdef __linux__ #include @@ -235,7 +227,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { } if (!SetPriorityClass(GetCurrentProcess(), p)) { - fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); + LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); return false; } @@ -260,7 +252,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { } if (!setpriority(PRIO_PROCESS, 0, p)) { - fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); + LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); return false; } return true; @@ -272,53 +264,6 @@ bool set_process_priority(enum ggml_sched_priority prio) { // CLI argument parsing // -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) 
{ - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -static void gpt_params_handle_model_default(gpt_params & params) { - if (!params.hf_repo.empty()) { - // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { - throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); - } - params.hf_file = params.model; - } else if (params.model.empty()) { - params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); - } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); - f = string_split(f, '?').front(); - params.model = fs_get_cache_file(string_split(f, '/').back()); - } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; - } -} void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { int32_t n_set = 0; @@ -340,158 +285,14 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) if (n_set && n_set < cpuparams.n_threads) { // Not enough set bits, may experience performance issues. - fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); + LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); } } -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options) { - std::string arg; - const std::string arg_prefix = "--"; - gpt_sampler_params & sparams = params.sparams; - - std::unordered_map arg_to_options; - for (auto & opt : options) { - for (const auto & arg : opt.args) { - arg_to_options[arg] = &opt; - } - } - - // handle environment variables - for (auto & opt : options) { - std::string value; - if (opt.get_value_from_env(value)) { - try { - if (opt.handler_void && (value == "1" || value == "true")) { - opt.handler_void(params); - } - if (opt.handler_int) { - opt.handler_int(params, std::stoi(value)); - } - if (opt.handler_string) { - opt.handler_string(params, value); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(format( - "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); - } - } - } - - // handle command line arguments - auto check_arg = [&](int i) { - if (i+1 >= argc) { - throw std::invalid_argument("expected value for argument"); - } - }; - - for (int i = 1; i < argc; i++) { - const std::string arg_prefix = "--"; - - std::string arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); - } - auto opt = *arg_to_options[arg]; - if (opt.has_value_from_env()) { - fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); - } - try { - if (opt.handler_void) { - opt.handler_void(params); - continue; - } - - // arg with single value - check_arg(i); - std::string val = argv[++i]; - if (opt.handler_int) { - 
opt.handler_int(params, std::stoi(val)); - continue; - } - if (opt.handler_string) { - opt.handler_string(params, val); - continue; - } - - // arg with 2 values - check_arg(i); - std::string val2 = argv[++i]; - if (opt.handler_str_str) { - opt.handler_str_str(params, val, val2); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(format( - "error while handling argument \"%s\": %s\n\n" - "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); - } - } - - postprocess_cpu_params(params.cpuparams, nullptr); - postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); - - if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { - throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); - } - - gpt_params_handle_model_default(params); - - if (params.escape) { - string_process_escapes(params.prompt); - string_process_escapes(params.input_prefix); - string_process_escapes(params.input_suffix); - for (auto & antiprompt : params.antiprompt) { - string_process_escapes(antiprompt); - } - } - - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - if (sparams.seed == LLAMA_DEFAULT_SEED) { - sparams.seed = time(NULL); - } - - return true; -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector & options) { - const auto params_org = params; // the example can modify the default params - - try { - if (!gpt_params_parse_ex(argc, argv, params, options)) { - params = params_org; - return false; - } - if (params.usage) { - gpt_params_print_usage(params, options); - if (params.print_usage) { - params.print_usage(argc, argv); - } - exit(0); - } - } catch (const std::invalid_argument & ex) { - fprintf(stderr, "%s\n", ex.what()); - params = params_org; - return false; - } - - return true; -} - bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { size_t dash_loc = range.find('-'); if (dash_loc == std::string::npos) { - fprintf(stderr, "Format of CPU range is invalid! Expected []-[].\n"); + LOG_ERR("Format of CPU range is invalid! 
Expected []-[].\n"); return false; } @@ -503,7 +304,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE } else { start_i = std::stoull(range.substr(0, dash_loc)); if (start_i >= GGML_MAX_N_THREADS) { - fprintf(stderr, "Start index out of bounds!\n"); + LOG_ERR("Start index out of bounds!\n"); return false; } } @@ -513,7 +314,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE } else { end_i = std::stoull(range.substr(dash_loc + 1)); if (end_i >= GGML_MAX_N_THREADS) { - fprintf(stderr, "End index out of bounds!\n"); + LOG_ERR("End index out of bounds!\n"); return false; } } @@ -548,7 +349,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD } else if (c >= 'A' && c <= 'F') { id -= 'A' - 10; } else { - fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); + LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i)); return false; } @@ -561,1741 +362,20 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { - std::vector result; - std::istringstream iss(input); - std::string line; - auto add_line = [&](const std::string& l) { - if (l.length() <= max_char_per_line) { - result.push_back(l); - } else { - std::istringstream line_stream(l); - std::string word, current_line; - while (line_stream >> word) { - if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { - if (!current_line.empty()) result.push_back(current_line); - current_line = word; - } else { - current_line += (!current_line.empty() ? " " : "") + word; - } - } - if (!current_line.empty()) result.push_back(current_line); +void gpt_init() { + llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { + if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) { + gpt_log_add(gpt_log_main(), level, "%s", text); } - }; - while (std::getline(iss, line)) { - add_line(line); - } - return result; -} + }, NULL); -std::string llama_arg::to_string() { - // params for printing to console - const static int n_leading_spaces = 40; - const static int n_char_per_line_help = 70; // TODO: detect this based on current console - std::string leading_spaces(n_leading_spaces, ' '); - - std::ostringstream ss; - for (const auto arg : args) { - if (arg == args.front()) { - if (args.size() == 1) { - ss << arg; - } else { - // first arg is usually abbreviation, we need padding to make it more beautiful - auto tmp = std::string(arg) + ", "; - ss << format("%-7s", tmp.c_str()); - } - } else { - ss << arg << (arg != args.back() ? ", " : ""); - } - } - if (value_hint) ss << " " << value_hint; - if (value_hint_2) ss << " " << value_hint_2; - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); - } - const auto help_lines = break_str_into_lines(help, n_char_per_line_help); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? 
"" : leading_spaces) << line << "\n"; - } - return ss.str(); -} - -void gpt_params_print_usage(gpt_params & params, std::vector & options) { - auto print_options = [](std::vector & options) { - for (llama_arg * opt : options) { - printf("%s", opt->to_string().c_str()); - } - }; - - std::vector common_options; - std::vector specific_options; - for (auto & opt : options) { - // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example - if (opt.in_example(params.curr_ex)) { - specific_options.push_back(&opt); - } else { - common_options.push_back(&opt); - } - } - printf("----- common options -----\n\n"); - print_options(common_options); - // TODO: maybe convert enum llama_example to string - printf("\n\n----- example-specific options -----\n\n"); - print_options(specific_options); -} - -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex) { - return gpt_params_parser_init(params, ex, nullptr); -} - -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage) { - std::vector options; - params.print_usage = print_usage; - params.curr_ex = ex; - - std::string sampler_type_chars; - std::string sampler_type_names; - for (const auto & sampler : params.sparams.samplers) { - sampler_type_chars += gpt_sampler_type_to_chr(sampler); - sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; - } - sampler_type_names.pop_back(); - - - /** - * filter options by example - * rules: - * - all examples inherit options from LLAMA_EXAMPLE_COMMON - * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example - * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example - */ - auto add_opt = [&](llama_arg arg) { - if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { - options.push_back(std::move(arg)); - } - }; - - - add_opt(llama_arg( - {"-h", "--help", "--usage"}, - "print usage and exit", - [](gpt_params & params) { - params.usage = true; - } - )); - add_opt(llama_arg( - {"--version"}, - "show version and build info", - [](gpt_params &) { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - )); - add_opt(llama_arg( - {"-v", "--verbose"}, - "print verbose information", - [](gpt_params & params) { - params.verbosity = 1; - } - )); - add_opt(llama_arg( - {"--verbosity"}, "N", - format("set specific verbosity level (default: %d)", params.verbosity), - [](gpt_params & params, int value) { - params.verbosity = value; - } - )); - add_opt(llama_arg( - {"--verbose-prompt"}, - format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](gpt_params & params) { - params.verbose_prompt = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--no-display-prompt"}, - format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](gpt_params & params) { - params.display_prompt = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-co", "--color"}, - format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? 
"true" : "false"), - [](gpt_params & params) { - params.use_color = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"-s", "--seed"}, "SEED", - format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed), - [](gpt_params & params, const std::string & value) { - params.sparams.seed = std::stoul(value); - } - )); - add_opt(llama_arg( - {"-t", "--threads"}, "N", - format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](gpt_params & params, int value) { - params.cpuparams.n_threads = value; - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_env("LLAMA_ARG_THREADS")); - add_opt(llama_arg( - {"-tb", "--threads-batch"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads)", - [](gpt_params & params, int value) { - params.cpuparams_batch.n_threads = value; - if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - )); - add_opt(llama_arg( - {"-td", "--threads-draft"}, "N", - "number of threads to use during generation (default: same as --threads)", - [](gpt_params & params, int value) { - params.draft_cpuparams.n_threads = value; - if (params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-tbd", "--threads-batch-draft"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](gpt_params & params, int value) { - params.draft_cpuparams_batch.n_threads = value; - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-C", "--cpu-mask"}, "M", - "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](gpt_params & params, const std::string & mask) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - )); - add_opt(llama_arg( - {"-Cr", "--cpu-range"}, "lo-hi", - "range of CPUs for affinity. 
Complements --cpu-mask", - [](gpt_params & params, const std::string & range) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - )); - add_opt(llama_arg( - {"--cpu-strict"}, "<0|1>", - format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](gpt_params & params, const std::string & value) { - params.cpuparams.strict_cpu = std::stoul(value); - } - )); - add_opt(llama_arg( - {"--prio"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.cpuparams.priority = (enum ggml_sched_priority) prio; - } - )); - add_opt(llama_arg( - {"--poll"}, "<0...100>", - format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](gpt_params & params, const std::string & value) { - params.cpuparams.poll = std::stoul(value); - } - )); - add_opt(llama_arg( - {"-Cb", "--cpu-mask-batch"}, "M", - "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - )); - add_opt(llama_arg( - {"-Crb", "--cpu-range-batch"}, "lo-hi", - "ranges of CPUs for affinity. Complements --cpu-mask-batch", - [](gpt_params & params, const std::string & range) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - )); - add_opt(llama_arg( - {"--cpu-strict-batch"}, "<0|1>", - "use strict CPU placement (default: same as --cpu-strict)", - [](gpt_params & params, int value) { - params.cpuparams_batch.strict_cpu = value; - } - )); - add_opt(llama_arg( - {"--prio-batch"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - )); - add_opt(llama_arg( - {"--poll-batch"}, "<0|1>", - "use polling to wait for work (default: same as --poll)", - [](gpt_params & params, int value) { - params.cpuparams_batch.poll = value; - } - )); - add_opt(llama_arg( - {"-Cd", "--cpu-mask-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-Crd", "--cpu-range-draft"}, "lo-hi", - "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](gpt_params & params, const std::string & range) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--cpu-strict-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](gpt_params & params, int value) { - params.draft_cpuparams.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--prio-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--poll-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: same as --poll])", - [](gpt_params & params, int value) { - params.draft_cpuparams.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-Cbd", "--cpu-mask-batch-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", - "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", - [](gpt_params & params, const std::string & range) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--cpu-strict-batch-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](gpt_params & params, int value) { - params.draft_cpuparams_batch.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--prio-batch-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--poll-batch-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: --poll-draft)", - [](gpt_params & params, int value) { - params.draft_cpuparams_batch.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--draft"}, "N", - format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](gpt_params & params, int value) { - params.n_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-ps", "--p-split"}, "N", - format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](gpt_params & params, const std::string & value) { - params.p_split = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-lcs", "--lookup-cache-static"}, "FNAME", - "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](gpt_params & params, const std::string & value) { - params.lookup_cache_static = value; - } - )); - add_opt(llama_arg( - {"-lcd", "--lookup-cache-dynamic"}, "FNAME", - "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](gpt_params & params, const std::string & value) { - params.lookup_cache_dynamic = value; - } - )); - add_opt(llama_arg( - {"-c", "--ctx-size"}, "N", - format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](gpt_params & params, int value) { - params.n_ctx = value; - } - ).set_env("LLAMA_ARG_CTX_SIZE")); - add_opt(llama_arg( - {"-n", "--predict", "--n-predict"}, "N", - format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](gpt_params & params, int value) { - params.n_predict = value; - } - ).set_env("LLAMA_ARG_N_PREDICT")); - add_opt(llama_arg( - {"-b", "--batch-size"}, "N", - format("logical maximum batch size (default: %d)", params.n_batch), - [](gpt_params & params, int value) { - params.n_batch = value; - } - ).set_env("LLAMA_ARG_BATCH")); - add_opt(llama_arg( - {"-ub", "--ubatch-size"}, "N", - format("physical maximum batch size (default: %d)", params.n_ubatch), - [](gpt_params & params, int value) { - params.n_ubatch = value; - } - ).set_env("LLAMA_ARG_UBATCH")); - add_opt(llama_arg( - {"--keep"}, "N", - format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](gpt_params & params, int value) { - params.n_keep = value; - } - 
)); - add_opt(llama_arg( - {"--chunks"}, "N", - format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](gpt_params & params, int value) { - params.n_chunks = value; - } - )); - add_opt(llama_arg( - {"-fa", "--flash-attn"}, - format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](gpt_params & params) { - params.flash_attn = true; - } - ).set_env("LLAMA_ARG_FLASH_ATTN")); - add_opt(llama_arg( - {"-p", "--prompt"}, "PROMPT", - ex == LLAMA_EXAMPLE_MAIN - ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" - : "prompt to start generation with", - [](gpt_params & params, const std::string & value) { - params.prompt = value; - } - )); - add_opt(llama_arg( - {"-f", "--file"}, "FNAME", - "a file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - // store the external file name in params - params.prompt_file = value; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } - )); - add_opt(llama_arg( - {"--in-file"}, "FNAME", - "an input file (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - params.in_files.push_back(value); - } - )); - add_opt(llama_arg( - {"-bf", "--binary-file"}, "FNAME", - "binary file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - // store the external file name in params - params.prompt_file = value; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); - } - )); - add_opt(llama_arg( - {"-e", "--escape"}, - format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), - [](gpt_params & params) { - params.escape = true; - } - )); - add_opt(llama_arg( - {"--no-escape"}, - "do not process escape sequences", - [](gpt_params & params) { - params.escape = false; - } - )); - add_opt(llama_arg( - {"-ptc", "--print-token-count"}, "N", - format("print token count every N tokens (default: %d)", params.n_print), - [](gpt_params & params, int value) { - params.n_print = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--prompt-cache"}, "FNAME", - "file to cache prompt state for faster startup (default: none)", - [](gpt_params & params, const std::string & value) { - params.path_prompt_cache = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--prompt-cache-all"}, - "if specified, saves user input and generations to cache as well\n", - [](gpt_params & params) { - params.prompt_cache_all = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--prompt-cache-ro"}, - "if specified, uses the prompt cache but does not update it", - [](gpt_params & params) { - params.prompt_cache_ro = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-r", "--reverse-prompt"}, "PROMPT", - "halt generation at PROMPT, return control in interactive mode\n", - [](gpt_params & params, const std::string & value) { - params.antiprompt.emplace_back(value); - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-sp", "--special"}, - format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](gpt_params & params) { - params.special = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-cnv", "--conversation"}, - format( - "run in conversation mode:\n" - "- does not print special tokens and suffix/prefix\n" - "- interactive mode is also enabled\n" - "(default: %s)", - params.conversation ? "true" : "false" - ), - [](gpt_params & params) { - params.conversation = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-i", "--interactive"}, - format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](gpt_params & params) { - params.interactive = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-if", "--interactive-first"}, - format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), - [](gpt_params & params) { - params.interactive_first = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-mli", "--multiline-input"}, - "allows you to write or paste multiple lines without ending each in '\\'", - [](gpt_params & params) { - params.multiline_input = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--in-prefix-bos"}, - "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](gpt_params & params) { - params.input_prefix_bos = true; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--in-prefix"}, "STRING", - "string to prefix user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { - params.input_prefix = value; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--in-suffix"}, "STRING", - "string to suffix after user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { - params.input_suffix = value; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--no-warmup"}, - "skip warming up the model with an empty run", - [](gpt_params & params) { - params.warmup = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--spm-infill"}, - format( - "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", - params.spm_infill ? "enabled" : "disabled" - ), - [](gpt_params & params) { - params.spm_infill = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"--samplers"}, "SAMPLERS", - format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](gpt_params & params, const std::string & value) { - const auto sampler_names = string_split(value, ';'); - params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); - } - )); - add_opt(llama_arg( - {"--sampling-seq"}, "SEQUENCE", - format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](gpt_params & params, const std::string & value) { - params.sparams.samplers = gpt_sampler_types_from_chars(value); - } - )); - add_opt(llama_arg( - {"--ignore-eos"}, - "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](gpt_params & params) { - params.sparams.ignore_eos = true; - } - )); - add_opt(llama_arg( - {"--penalize-nl"}, - format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? 
"true" : "false"), - [](gpt_params & params) { - params.sparams.penalize_nl = true; - } - )); - add_opt(llama_arg( - {"--temp"}, "N", - format("temperature (default: %.1f)", (double)params.sparams.temp), - [](gpt_params & params, const std::string & value) { - params.sparams.temp = std::stof(value); - params.sparams.temp = std::max(params.sparams.temp, 0.0f); - } - )); - add_opt(llama_arg( - {"--top-k"}, "N", - format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), - [](gpt_params & params, int value) { - params.sparams.top_k = value; - } - )); - add_opt(llama_arg( - {"--top-p"}, "N", - format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), - [](gpt_params & params, const std::string & value) { - params.sparams.top_p = std::stof(value); - } - )); - add_opt(llama_arg( - {"--min-p"}, "N", - format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), - [](gpt_params & params, const std::string & value) { - params.sparams.min_p = std::stof(value); - } - )); - add_opt(llama_arg( - {"--tfs"}, "N", - format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), - [](gpt_params & params, const std::string & value) { - params.sparams.tfs_z = std::stof(value); - } - )); - add_opt(llama_arg( - {"--typical"}, "N", - format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), - [](gpt_params & params, const std::string & value) { - params.sparams.typ_p = std::stof(value); - } - )); - add_opt(llama_arg( - {"--repeat-last-n"}, "N", - format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), - [](gpt_params & params, int value) { - params.sparams.penalty_last_n = value; - params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); - } - )); - add_opt(llama_arg( - {"--repeat-penalty"}, "N", - format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), - [](gpt_params & params, const std::string & value) { - params.sparams.penalty_repeat = std::stof(value); - } - )); - add_opt(llama_arg( - {"--presence-penalty"}, "N", - format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), - [](gpt_params & params, const std::string & value) { - params.sparams.penalty_present = std::stof(value); - } - )); - add_opt(llama_arg( - {"--frequency-penalty"}, "N", - format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), - [](gpt_params & params, const std::string & value) { - params.sparams.penalty_freq = std::stof(value); - } - )); - add_opt(llama_arg( - {"--dynatemp-range"}, "N", - format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), - [](gpt_params & params, const std::string & value) { - params.sparams.dynatemp_range = std::stof(value); - } - )); - add_opt(llama_arg( - {"--dynatemp-exp"}, "N", - format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), - [](gpt_params & params, const std::string & value) { - params.sparams.dynatemp_exponent = std::stof(value); - } - )); - add_opt(llama_arg( - {"--mirostat"}, "N", - format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", 
params.sparams.mirostat), - [](gpt_params & params, int value) { - params.sparams.mirostat = value; - } - )); - add_opt(llama_arg( - {"--mirostat-lr"}, "N", - format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), - [](gpt_params & params, const std::string & value) { - params.sparams.mirostat_eta = std::stof(value); - } - )); - add_opt(llama_arg( - {"--mirostat-ent"}, "N", - format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), - [](gpt_params & params, const std::string & value) { - params.sparams.mirostat_tau = std::stof(value); - } - )); - add_opt(llama_arg( - {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", - "modifies the likelihood of token appearing in the completion,\n" - "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" - "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](gpt_params & params, const std::string & value) { - std::stringstream ss(value); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - params.sparams.logit_bias.push_back({key, bias}); - } else { - throw std::invalid_argument("invalid input format"); - } - } catch (const std::exception&) { - throw std::invalid_argument("invalid input format"); - } - } - )); - add_opt(llama_arg( - {"--grammar"}, "GRAMMAR", - format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), - [](gpt_params & params, const std::string & value) { - params.sparams.grammar = value; - } - )); - add_opt(llama_arg( - {"--grammar-file"}, "FNAME", - "file to read grammar from", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(params.sparams.grammar) - ); - } - )); - add_opt(llama_arg( - {"-j", "--json-schema"}, "SCHEMA", - "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](gpt_params & params, const std::string & value) { - params.sparams.grammar = json_schema_to_grammar(json::parse(value)); - } - )); - add_opt(llama_arg( - {"--pooling"}, "{none,mean,cls,last}", - "pooling type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--attention"}, "{causal,non,causal}", - "attention type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--rope-scaling"}, "{none,linear,yarn}", - "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { throw std::invalid_argument("invalid value"); } - } - )); - add_opt(llama_arg( - {"--rope-scale"}, "N", - "RoPE context scaling factor, expands context by a factor of N", - [](gpt_params & params, const std::string & value) { - params.rope_freq_scale = 1.0f / std::stof(value); - } - )); - add_opt(llama_arg( - {"--rope-freq-base"}, "N", - "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](gpt_params & params, const std::string & value) { - params.rope_freq_base = std::stof(value); - } - )); - add_opt(llama_arg( - {"--rope-freq-scale"}, "N", - "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](gpt_params & params, const std::string & value) { - params.rope_freq_scale = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-orig-ctx"}, "N", - format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](gpt_params & params, int value) { - params.yarn_orig_ctx = value; - } - )); - add_opt(llama_arg( - {"--yarn-ext-factor"}, "N", - format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](gpt_params & params, const std::string & value) { - params.yarn_ext_factor = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-attn-factor"}, "N", - format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](gpt_params & params, const std::string & value) { - params.yarn_attn_factor = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-beta-slow"}, "N", - format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](gpt_params & params, const 
std::string & value) { - params.yarn_beta_slow = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-beta-fast"}, "N", - format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](gpt_params & params, const std::string & value) { - params.yarn_beta_fast = std::stof(value); - } - )); - add_opt(llama_arg( - {"-gan", "--grp-attn-n"}, "N", - format("group-attention factor (default: %d)", params.grp_attn_n), - [](gpt_params & params, int value) { - params.grp_attn_n = value; - } - )); - add_opt(llama_arg( - {"-gaw", "--grp-attn-w"}, "N", - format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [](gpt_params & params, int value) { - params.grp_attn_w = value; - } - )); - add_opt(llama_arg( - {"-dkvc", "--dump-kv-cache"}, - "verbose print of the KV cache", - [](gpt_params & params) { - params.dump_kv_cache = true; - } - )); - add_opt(llama_arg( - {"-nkvo", "--no-kv-offload"}, - "disable KV offload", - [](gpt_params & params) { - params.no_kv_offload = true; - } - )); - add_opt(llama_arg( - {"-ctk", "--cache-type-k"}, "TYPE", - format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [](gpt_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_k = value; - } - )); - add_opt(llama_arg( - {"-ctv", "--cache-type-v"}, "TYPE", - format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [](gpt_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_v = value; - } - )); - add_opt(llama_arg( - {"--perplexity", "--all-logits"}, - format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](gpt_params & params) { - params.logits_all = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--hellaswag"}, - "compute HellaSwag score over random tasks from datafile supplied with -f", - [](gpt_params & params) { - params.hellaswag = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--hellaswag-tasks"}, "N", - format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](gpt_params & params, int value) { - params.hellaswag_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--winogrande"}, - "compute Winogrande score over random tasks from datafile supplied with -f", - [](gpt_params & params) { - params.winogrande = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--winogrande-tasks"}, "N", - format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](gpt_params & params, int value) { - params.winogrande_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--multiple-choice"}, - "compute multiple choice score over random tasks from datafile supplied with -f", - [](gpt_params & params) { - params.multiple_choice = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--multiple-choice-tasks"}, "N", - format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](gpt_params & params, int value) { - params.multiple_choice_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--kl-divergence"}, - "computes KL-divergence to logits provided via --kl-divergence-base", - 
[](gpt_params & params) { - params.kl_divergence = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--save-all-logits", "--kl-divergence-base"}, "FNAME", - "set logits file", - [](gpt_params & params, const std::string & value) { - params.logits_file = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--ppl-stride"}, "N", - format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](gpt_params & params, int value) { - params.ppl_stride = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--ppl-output-type"}, "<0|1>", - format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](gpt_params & params, int value) { - params.ppl_output_type = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"-dt", "--defrag-thold"}, "N", - format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](gpt_params & params, const std::string & value) { - params.defrag_thold = std::stof(value); - } - ).set_env("LLAMA_ARG_DEFRAG_THOLD")); - add_opt(llama_arg( - {"-np", "--parallel"}, "N", - format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](gpt_params & params, int value) { - params.n_parallel = value; - } - )); - add_opt(llama_arg( - {"-ns", "--sequences"}, "N", - format("number of sequences to decode (default: %d)", params.n_sequences), - [](gpt_params & params, int value) { - params.n_sequences = value; - } - )); - add_opt(llama_arg( - {"-cb", "--cont-batching"}, - format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](gpt_params & params) { - params.cont_batching = true; - } - ).set_env("LLAMA_ARG_CONT_BATCHING")); - add_opt(llama_arg( - {"-nocb", "--no-cont-batching"}, - "disable continuous batching", - [](gpt_params & params) { - params.cont_batching = false; - } - ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); - add_opt(llama_arg( - {"--mmproj"}, "FILE", - "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](gpt_params & params, const std::string & value) { - params.mmproj = value; - } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); - add_opt(llama_arg( - {"--image"}, "FILE", - "path to an image file. use with multimodal models. 
Specify multiple times for batching", - [](gpt_params & params, const std::string & value) { - params.image.emplace_back(value); - } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); -#ifdef GGML_USE_RPC - add_opt(llama_arg( - {"--rpc"}, "SERVERS", - "comma separated list of RPC servers", - [](gpt_params & params, const std::string & value) { - params.rpc_servers = value; - } - )); +#ifdef NDEBUG + const char * build_type = ""; +#else + const char * build_type = " (debug)"; #endif - add_opt(llama_arg( - {"--mlock"}, - "force system to keep model in RAM rather than swapping or compressing", - [](gpt_params & params) { - params.use_mlock = true; - } - )); - add_opt(llama_arg( - {"--no-mmap"}, - "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](gpt_params & params) { - params.use_mmap = false; - } - )); - add_opt(llama_arg( - {"--numa"}, "TYPE", - "attempt optimizations that help on some NUMA systems\n" - "- distribute: spread execution evenly over all nodes\n" - "- isolate: only spawn threads on CPUs on the node that execution started on\n" - "- numactl: use the CPU map provided by numactl\n" - "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { throw std::invalid_argument("invalid value"); } - } - )); - add_opt(llama_arg( - {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", - "number of layers to store in VRAM", - [](gpt_params & params, int value) { - params.n_gpu_layers = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - ).set_env("LLAMA_ARG_N_GPU_LAYERS")); - add_opt(llama_arg( - {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", - "number of layers to store in VRAM for the draft model", - [](gpt_params & params, int value) { - params.n_gpu_layers_draft = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-sm", "--split-mode"}, "{none,layer,row}", - "how to split the model across multiple GPUs, one of:\n" - "- none: use one GPU only\n" - "- layer (default): split layers and KV across GPUs\n" - "- row: split rows across GPUs", - [](gpt_params & params, const std::string & value) { - std::string arg_next = value; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } - else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. 
It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } - else { - throw std::invalid_argument("invalid value"); - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - } - )); - add_opt(llama_arg( - {"-ts", "--tensor-split"}, "N0,N1,N2,...", - "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](gpt_params & params, const std::string & value) { - std::string arg_next = value; - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { - throw std::invalid_argument( - format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) - ); - } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } else { - params.tensor_split[i] = 0.0f; - } - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - } - )); - add_opt(llama_arg( - {"-mg", "--main-gpu"}, "INDEX", - format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](gpt_params & params, int value) { - params.main_gpu = value; -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - } - )); - add_opt(llama_arg( - {"--check-tensors"}, - format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](gpt_params & params) { - params.check_tensors = true; - } - )); - add_opt(llama_arg( - {"--override-kv"}, "KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](gpt_params & params, const std::string & value) { - if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); - } - } - )); - add_opt(llama_arg( - {"--lora"}, "FNAME", - "path to LoRA adapter (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0 }); - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( - {"--lora-scaled"}, "FNAME", "SCALE", - "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale) }); - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( - {"--control-vector"}, "FNAME", - "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](gpt_params & params, const std::string & value) { - params.control_vectors.push_back({ 1.0f, value, }); - } - )); - add_opt(llama_arg( - {"--control-vector-scaled"}, "FNAME", "SCALE", - "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors", - [](gpt_params & params, const std::string & fname, const std::string & scale) { - params.control_vectors.push_back({ std::stof(scale), fname }); - } - )); - add_opt(llama_arg( - {"--control-vector-layer-range"}, "START", "END", - "layer range to apply the control vector(s) to, start and end inclusive", - [](gpt_params & params, const std::string & start, const std::string & end) { - params.control_vector_layer_start = std::stoi(start); - params.control_vector_layer_end = std::stoi(end); - } - )); - add_opt(llama_arg( - {"-a", "--alias"}, "STRING", - "set alias for model name (to be used by REST API)", - [](gpt_params & params, const std::string & value) { - params.model_alias = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"-m", "--model"}, "FNAME", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? 
std::string("model path from which to load base model") - : format( - "model path (default: `models/$filename` with filename from `--hf-file` " - "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH - ), - [](gpt_params & params, const std::string & value) { - params.model = value; - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); - add_opt(llama_arg( - {"-md", "--model-draft"}, "FNAME", - "draft model for speculative decoding (default: unused)", - [](gpt_params & params, const std::string & value) { - params.model_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-mu", "--model-url"}, "MODEL_URL", - "model download url (default: unused)", - [](gpt_params & params, const std::string & value) { - params.model_url = value; - } - ).set_env("LLAMA_ARG_MODEL_URL")); - add_opt(llama_arg( - {"-hfr", "--hf-repo"}, "REPO", - "Hugging Face model repository (default: unused)", - [](gpt_params & params, const std::string & value) { - params.hf_repo = value; - } - ).set_env("LLAMA_ARG_HF_REPO")); - add_opt(llama_arg( - {"-hff", "--hf-file"}, "FILE", - "Hugging Face model file (default: unused)", - [](gpt_params & params, const std::string & value) { - params.hf_file = value; - } - ).set_env("LLAMA_ARG_HF_FILE")); - add_opt(llama_arg( - {"-hft", "--hf-token"}, "TOKEN", - "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](gpt_params & params, const std::string & value) { - params.hf_token = value; - } - ).set_env("HF_TOKEN")); - add_opt(llama_arg( - {"--context-file"}, "FNAME", - "file to load context from (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - params.context_files.push_back(value); - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( - {"--chunk-size"}, "N", - format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](gpt_params & params, int value) { - params.chunk_size = value; - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( - {"--chunk-separator"}, "STRING", - format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](gpt_params & params, const std::string & value) { - params.chunk_separator = value; - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( - {"--junk"}, "N", - format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](gpt_params & params, int value) { - params.n_junk = value; - } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( - {"--pos"}, "N", - format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](gpt_params & params, int value) { - params.i_pos = value; - } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( - {"-o", "--output", "--output-file"}, "FNAME", - format("output file (default: '%s')", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? params.lora_outfile.c_str() - : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR - ? 
params.cvector_outfile.c_str() - : params.out_file.c_str()), - [](gpt_params & params, const std::string & value) { - params.out_file = value; - params.cvector_outfile = value; - params.lora_outfile = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( - {"-ofreq", "--output-frequency"}, "N", - format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](gpt_params & params, int value) { - params.n_out_freq = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--save-frequency"}, "N", - format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](gpt_params & params, int value) { - params.n_save_freq = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--process-output"}, - format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](gpt_params & params) { - params.process_output = true; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--no-ppl"}, - format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](gpt_params & params) { - params.compute_ppl = false; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--chunk", "--from-chunk"}, "N", - format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](gpt_params & params, int value) { - params.i_chunk = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"-pps"}, - format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), - [](gpt_params & params) { - params.is_pp_shared = true; - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"-npp"}, "n0,n1,...", - "number of prompt tokens", - [](gpt_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"-ntg"}, "n0,n1,...", - "number of text generation tokens", - [](gpt_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"-npl"}, "n0,n1,...", - "number of parallel prompts", - [](gpt_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"--embd-normalize"}, "N", - format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](gpt_params & params, int value) { - params.embd_normalize = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--embd-output-format"}, "FORMAT", - "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](gpt_params & params, const std::string & value) { - params.embd_out = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--embd-separator"}, "STRING", - "separator of embendings (default \\n) for example \"<#sep#>\"", - [](gpt_params & params, const std::string & value) { - params.embd_sep = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - 
add_opt(llama_arg( - {"--host"}, "HOST", - format("ip address to listen (default: %s)", params.hostname.c_str()), - [](gpt_params & params, const std::string & value) { - params.hostname = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); - add_opt(llama_arg( - {"--port"}, "PORT", - format("port to listen (default: %d)", params.port), - [](gpt_params & params, int value) { - params.port = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); - add_opt(llama_arg( - {"--path"}, "PATH", - format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](gpt_params & params, const std::string & value) { - params.public_path = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--embedding", "--embeddings"}, - format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](gpt_params & params) { - params.embedding = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); - add_opt(llama_arg( - {"--api-key"}, "KEY", - "API key to use for authentication (default: none)", - [](gpt_params & params, const std::string & value) { - params.api_keys.push_back(value); - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); - add_opt(llama_arg( - {"--api-key-file"}, "FNAME", - "path to file containing API keys (default: none)", - [](gpt_params & params, const std::string & value) { - std::ifstream key_file(value); - if (!key_file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::string key; - while (std::getline(key_file, key)) { - if (!key.empty()) { - params.api_keys.push_back(key); - } - } - key_file.close(); - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--ssl-key-file"}, "FNAME", - "path to file a PEM-encoded SSL private key", - [](gpt_params & params, const std::string & value) { - params.ssl_file_key = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--ssl-cert-file"}, "FNAME", - "path to file a PEM-encoded SSL certificate", - [](gpt_params & params, const std::string & value) { - params.ssl_file_cert = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"-to", "--timeout"}, "N", - format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](gpt_params & params, int value) { - params.timeout_read = value; - params.timeout_write = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--threads-http"}, "N", - format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](gpt_params & params, int value) { - params.n_threads_http = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); - add_opt(llama_arg( - {"-spf", "--system-prompt-file"}, "FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - 
{"--log-format"}, "{text, json}", - "log output format: json or text (default: json)", - [](gpt_params & params, const std::string & value) { - if (value == "json") { - params.log_json = true; - } else if (value == "text") { - params.log_json = false; - } else { - throw std::invalid_argument("invalid value"); - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--metrics"}, - format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](gpt_params & params) { - params.endpoint_metrics = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); - add_opt(llama_arg( - {"--no-slots"}, - format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](gpt_params & params) { - params.endpoint_slots = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); - add_opt(llama_arg( - {"--slot-save-path"}, "PATH", - "path to save slot kv cache (default: disabled)", - [](gpt_params & params, const std::string & value) { - params.slot_save_path = value; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--chat-template"}, "JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [](gpt_params & params, const std::string & value) { - if (!llama_chat_verify_template(value)) { - throw std::runtime_error(format( - "error: the supplied chat template is not supported: %s\n" - "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", - value.c_str() - )); - } - params.chat_template = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); - add_opt(llama_arg( - {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", - format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](gpt_params & params, const std::string & value) { - params.slot_prompt_similarity = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--lora-init-without-apply"}, - format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), - [](gpt_params & params) { - params.lora_init_without_apply = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--simple-io"}, - "use basic IO for better compatibility in subprocesses and limited consoles", - [](gpt_params & params) { - params.simple_io = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"-ld", "--logdir"}, "LOGDIR", - "path under which to save YAML logs (no logging if unset)", - [](gpt_params & params, const std::string & value) { - params.logdir = value; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } - } - )); - add_opt(llama_arg( - {"--positive-file"}, "FNAME", - format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](gpt_params & params, const std::string & value) { - params.cvector_positive_file = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--negative-file"}, "FNAME", - format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](gpt_params & params, const std::string & value) { - params.cvector_negative_file = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--pca-batch"}, "N", - format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](gpt_params & params, int value) { - params.n_pca_batch = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--pca-iter"}, "N", - format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](gpt_params & params, int value) { - params.n_pca_iterations = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--method"}, "{pca, mean}", - "dimensionality reduction method to be used (default: pca)", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--output-format"}, "{md,jsonl}", - "output format for batched-bench results (default: md)", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } - else if (value == "md") { params.batched_bench_output_jsonl = false; } - else { std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); -#ifndef LOG_DISABLE_LOGS - // TODO: make this looks less weird - add_opt(llama_arg( - {"--log-test"}, - "Log test", - [](gpt_params &) { log_param_single_parse("--log-test"); } - )); - add_opt(llama_arg( - {"--log-disable"}, - "Log disable", - [](gpt_params &) { log_param_single_parse("--log-disable"); } - )); - add_opt(llama_arg( - {"--log-enable"}, - "Log enable", - [](gpt_params &) { log_param_single_parse("--log-enable"); } - )); - add_opt(llama_arg( - {"--log-new"}, - "Log new", - [](gpt_params &) { log_param_single_parse("--log-new"); } - )); - add_opt(llama_arg( - {"--log-append"}, - "Log append", - [](gpt_params &) { log_param_single_parse("--log-append"); } - )); - add_opt(llama_arg( - {"--log-file"}, "FNAME", - "Log file", - [](gpt_params &, const std::string & value) { log_param_pair_parse(false, 
"--log-file", value); } - )); -#endif // LOG_DISABLE_LOGS - - return options; + LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); } std::string gpt_params_get_system_info(const gpt_params & params) { @@ -2378,6 +458,94 @@ void string_replace_all(std::string & s, const std::string & search, const std:: s = std::move(builder); } +std::string string_from(bool value) { + return value ? "true" : "false"; +} + +std::string string_from(const std::vector & values) { + std::stringstream buf; + + buf << "[ "; + bool first = true; + for (auto e : values) { + if (first) { + first = false; + } else { + buf << ", "; + } + buf << std::to_string(e); + } + buf << " ]"; + + return buf.str(); +} + +std::string string_from(const struct llama_context * ctx, const std::vector & tokens) { + std::stringstream buf; + + buf << "[ "; + + bool first = true; + for (const auto & token : tokens) { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, token); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf << "'" << detokenized << "'" + << ":" << std::to_string(token); + } + + buf << " ]"; + + return buf.str(); +} + +std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) { + std::stringstream buf; + + buf << "[ "; + + bool first = true; + for (int i = 0; i < batch.n_tokens; ++i) { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, batch.token[i]); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf << "\n" << std::to_string(i) + << ":token '" << detokenized << "'" + << ":pos " << std::to_string(batch.pos[i]) + << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ":seq_id " << std::to_string(batch.seq_id[i][0]) + << ":logits " << std::to_string(batch.logits[i]); + } + + buf << " ]"; + + return buf.str(); +} + void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -2418,7 +586,7 @@ void string_process_escapes(std::string & input) { bool string_parse_kv_override(const char * data, std::vector & overrides) { const char * sep = strchr(data, '='); if (sep == nullptr || sep - data >= 128) { - fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + LOG_ERR("%s: malformed KV override '%s'\n", __func__, data); return false; } llama_model_kv_override kvo; @@ -2441,20 +609,20 @@ bool string_parse_kv_override(const char * data, std::vector 127) { - fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); return false; } strncpy(kvo.val_str, sep, 127); kvo.val_str[127] = '\0'; } else { - fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data); return false; } overrides.emplace_back(std::move(kvo)); @@ -2666,7 +834,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to load model '%s'\n", 
__func__, params.model.c_str());
         return iparams;
     }
@@ -2674,7 +842,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
-        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
         return iparams;
     }
@@ -2710,7 +878,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         loaded_la.scale = la.scale;
         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
         if (loaded_la.adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
@@ -2722,12 +890,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
-        fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sparams.ignore_eos = false;
     }
 
     if (params.warmup) {
-        LOG("warming up the model with an empty run\n");
+        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector tmp;
         llama_token bos = llama_token_bos(model);
@@ -2757,7 +925,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(lctx);
     }
 
     iparams.model = model;
@@ -2853,6 +1021,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
 
@@ -2878,17 +1047,44 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #ifdef LLAMA_USE_CURL
+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
+
+
 static bool starts_with(const std::string & str, const std::string & prefix) {
     // While we wait for C++20's std::string::starts_with...
     return str.rfind(prefix, 0) == 0;
 }
 
+static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+    int remaining_attempts = max_attempts;
+
+    while (remaining_attempts > 0) {
+        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+
+        CURLcode res = curl_easy_perform(curl);
+        if (res == CURLE_OK) {
+            return true;
+        }
+
+        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+
+        remaining_attempts--;
+        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+    }
+
+    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
+    return false;
+}
+
 static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 
     // Initialize libcurl
     std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
-        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
         return false;
     }
 
@@ -2929,11 +1125,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat
         if (metadata_in.good()) {
             try {
                 metadata_in >> metadata;
-                fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
                 if (metadata.contains("url") && metadata.at("url").is_string()) {
                     auto previous_url = metadata.at("url").get();
                     if (previous_url != url) {
-                        fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
                         return false;
                     }
                 }
@@ -2944,12 +1140,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat
                     last_modified = metadata.at("lastModified");
                 }
             } catch (const nlohmann::json::exception & e) {
-                fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
                 return false;
             }
         }
     } else {
-        fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
     }
 
     // Send a HEAD request to retrieve the etag and last-modified headers
@@ -2986,9 +1182,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
         curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback));
         curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
 
-        CURLcode res = curl_easy_perform(curl.get());
-        if (res != CURLE_OK) {
-            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+        if (!was_perform_successful) {
             return false;
         }
 
@@ -2998,26 +1193,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat
             // HEAD not supported, we don't know if the file has changed
            // force trigger downloading
            force_download = true;
-            fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
         }
     }
 
     bool should_download = !file_exists || force_download;
     if (!should_download) {
         if (!etag.empty() && etag != headers.etag) {
-            fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
             should_download = true;
         } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
             should_download = true;
         }
     }
     if (should_download) {
         std::string path_temporary = path + ".downloadInProgress";
         if (file_exists) {
-            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
             if (remove(path.c_str()) != 0) {
-                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
                 return false;
             }
         }
@@ -3032,7 +1227,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 
         std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb"));
         if (!outfile) {
-            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
+            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
             return false;
         }
 
@@ -3063,18 +1258,17 @@ static bool llama_download_file(const std::string & url, const std::string & pat
         };
 
         // start the download
-        fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        auto res = curl_easy_perform(curl.get());
-        if (res != CURLE_OK) {
-            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+        if (!was_perform_successful) {
            return false;
         }
 
         long http_code = 0;
         curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
         if (http_code < 200 || http_code >= 400) {
-            fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
+            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
             return false;
         }
 
@@ -3088,10 +1282,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
             {"lastModified", headers.last_modified}
         });
         std::ofstream(metadata_path) << metadata.dump(4);
-        fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
 
         if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__,
path_temporary.c_str(), path.c_str()); + LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); return false; } } @@ -3106,7 +1300,7 @@ struct llama_model * llama_load_model_from_url( const struct llama_model_params & params) { // Basic validation of the model_url if (!model_url || strlen(model_url) == 0) { - fprintf(stderr, "%s: invalid model_url\n", __func__); + LOG_ERR("%s: invalid model_url\n", __func__); return NULL; } @@ -3123,7 +1317,7 @@ struct llama_model * llama_load_model_from_url( }; auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); if (!ctx_gguf) { - fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model); return NULL; } @@ -3143,14 +1337,12 @@ struct llama_model * llama_load_model_from_url( // and extract split URL and PATH prefixes { if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { - fprintf(stderr, "\n%s: unexpected model file name: %s" - " n_split=%d\n", __func__, path_model, n_split); + LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split); return NULL; } if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { - fprintf(stderr, "\n%s: unexpected model url: %s" - " n_split=%d\n", __func__, model_url, n_split); + LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split); return NULL; } } @@ -3210,7 +1402,7 @@ struct llama_model * llama_load_model_from_url( const char * /*path_model*/, const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); return nullptr; } @@ -3220,7 +1412,7 @@ struct llama_model * llama_load_model_from_hf( const char * /*path_model*/, const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); return nullptr; } @@ -3548,13 +1740,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr }; struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); if (!ctx_gguf) { - fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); return result; } int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); if (n_tensors == 0) { - fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); } for (int i = 0; i < n_tensors; i++) { @@ -3572,23 +1764,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } } if (layer_idx < 0) { - fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } else if (layer_idx == 0) { - fprintf(stderr, "%s: invalid (zero) direction tensor 
layer index in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); if (tensor->type != GGML_TYPE_F32) { - fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } if (ggml_n_dims(tensor) != 1) { - fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -3596,7 +1788,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr if (result.n_embd == -1) { result.n_embd = ggml_nelements(tensor); } else if (ggml_nelements(tensor) != result.n_embd) { - fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -3613,7 +1805,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } if (result.n_embd == -1) { - fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); + LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); result.data.clear(); } @@ -3634,7 +1826,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector #include #include -#include -#include -#include -#include -#include -#include +#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -56,11 +45,20 @@ struct llama_control_vector_load_info; // CPU utils // +struct cpu_params { + int n_threads = -1; + bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. + bool mask_valid = false; // Default: any CPU + enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) + bool strict_cpu = false; // Use strict CPU placement + uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) +}; + int32_t cpu_get_num_physical_cores(); int32_t cpu_get_num_math(); // -// CLI argument parsing +// Common params // enum llama_example { @@ -78,28 +76,72 @@ enum llama_example { LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_LLAVA, + LLAMA_EXAMPLE_LOOKUP, + LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_COUNT, }; +enum gpt_sampler_type { + GPT_SAMPLER_TYPE_NONE = 0, + GPT_SAMPLER_TYPE_TOP_K = 1, + GPT_SAMPLER_TYPE_TOP_P = 2, + GPT_SAMPLER_TYPE_MIN_P = 3, + GPT_SAMPLER_TYPE_TFS_Z = 4, + GPT_SAMPLER_TYPE_TYPICAL_P = 5, + GPT_SAMPLER_TYPE_TEMPERATURE = 6, +}; + // dimensionality reduction methods, used by cvector-generator enum dimre_method { DIMRE_METHOD_PCA, DIMRE_METHOD_MEAN, }; -struct cpu_params { - int n_threads = -1; - bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. 
- bool mask_valid = false; // Default: any CPU - enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) - bool strict_cpu = false; // Use strict CPU placement - uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) +// sampler parameters +struct gpt_sampler_params { + uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler + + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = false; // consider newlines as a repeatable token + bool ignore_eos = false; + bool no_perf = false; // disable performance metrics + + std::vector samplers = { + GPT_SAMPLER_TYPE_TOP_K, + GPT_SAMPLER_TYPE_TFS_Z, + GPT_SAMPLER_TYPE_TYPICAL_P, + GPT_SAMPLER_TYPE_TOP_P, + GPT_SAMPLER_TYPE_MIN_P, + GPT_SAMPLER_TYPE_TEMPERATURE + }; + + std::string grammar; // optional BNF-like grammar to constrain sampling + + std::vector logit_bias; // logit biases to apply + + // print the parameters into a string + std::string print() const; }; struct gpt_params { - enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON; - int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -143,23 +185,23 @@ struct gpt_params { struct gpt_sampler_params sparams; - std::string model = ""; // model path - std::string model_draft = ""; // draft model for speculative decoding - std::string model_alias = "unknown"; // model alias - std::string model_url = ""; // model url to download - std::string hf_token = ""; // HF token - std::string hf_repo = ""; // HF repo - std::string hf_file = ""; // HF file - std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with - std::string logdir = ""; // directory in which to save YAML log files - std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding - std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding - std::string logits_file = ""; // file for saving *all* logits - std::string rpc_servers = ""; // comma separated list of RPC servers 
+ std::string model = ""; // model path // NOLINT + std::string model_draft = ""; // draft model for speculative decoding // NOLINT + std::string model_alias = "unknown"; // model alias // NOLINT + std::string model_url = ""; // model url to download // NOLINT + std::string hf_token = ""; // HF token // NOLINT + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT + std::string prompt = ""; // NOLINT + std::string prompt_file = ""; // store the external prompt file name // NOLINT + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT + std::string input_prefix = ""; // string to prefix user inputs with // NOLINT + std::string input_suffix = ""; // string to suffix user inputs with // NOLINT + std::string logdir = ""; // directory in which to save YAML log files // NOLINT + std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT + std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT + std::string logits_file = ""; // file for saving *all* logits // NOLINT + std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) @@ -189,7 +231,6 @@ struct gpt_params { bool kl_divergence = false; // compute KL divergence - std::function print_usage = nullptr; // print example-specific usage and example bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs bool special = false; // enable special token output @@ -204,6 +245,8 @@ struct gpt_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention + bool no_perf = false; // disable performance metrics + bool ctx_shift = true; // context shift on inifinite text generation bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool logits_all = false; // return logits for all tokens in the batch @@ -220,7 +263,7 @@ struct gpt_params { std::string cache_type_v = "f16"; // KV cache data type for the V // multimodal models (see examples/llava) - std::string mmproj = ""; // path to multimodal projector + std::string mmproj = ""; // path to multimodal projector // NOLINT std::vector image; // path to image file(s) // embedding @@ -236,15 +279,15 @@ struct gpt_params { int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) std::string hostname = "127.0.0.1"; - std::string public_path = ""; - std::string chat_template = ""; - std::string system_prompt = ""; + std::string public_path = ""; // NOLINT + std::string chat_template = ""; // NOLINT + std::string system_prompt = ""; // NOLINT bool enable_chat_template = true; std::vector api_keys; - std::string ssl_file_key = ""; - std::string ssl_file_cert = ""; + std::string ssl_file_key = ""; // NOLINT + std::string ssl_file_cert = ""; // NOLINT bool endpoint_slots = true; bool endpoint_metrics = false; @@ -299,91 +342,9 @@ struct gpt_params { bool batched_bench_output_jsonl = false; }; -struct llama_arg { - std::set examples = {LLAMA_EXAMPLE_COMMON}; - std::vector args; - const char * value_hint = nullptr; // help text or example for arg value - const char * value_hint_2 = nullptr; // for second 
arg value - const char * env = nullptr; - std::string help; - void (*handler_void) (gpt_params & params) = nullptr; - void (*handler_string) (gpt_params & params, const std::string &) = nullptr; - void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; - void (*handler_int) (gpt_params & params, int) = nullptr; - - llama_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, - void (*handler)(gpt_params & params, const std::string &) - ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - - llama_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, - void (*handler)(gpt_params & params, int) - ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - - llama_arg( - const std::initializer_list & args, - const std::string & help, - void (*handler)(gpt_params & params) - ) : args(args), help(help), handler_void(handler) {} - - // support 2 values for arg - llama_arg( - const std::initializer_list & args, - const char * value_hint, - const char * value_hint_2, - const std::string & help, - void (*handler)(gpt_params & params, const std::string &, const std::string &) - ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - - llama_arg & set_examples(std::initializer_list examples) { - this->examples = std::move(examples); - return *this; - } - - llama_arg & set_env(const char * env) { - help = help + "\n(env: " + env + ")"; - this->env = env; - return *this; - } - - bool in_example(enum llama_example ex) { - return examples.find(ex) != examples.end(); - } - - bool get_value_from_env(std::string & output) const { - if (env == nullptr) return false; - char * value = std::getenv(env); - if (value) { - output = value; - return true; - } - return false; - } - - bool has_value_from_env() const { - return env != nullptr && std::getenv(env); - } - - std::string to_string(); -}; - -// initialize list of options (arguments) that can be used by the current example -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); -// optionally, we can provide "print_usage" to print example usage -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage); - -// parse input arguments from CLI -// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) -bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options); - -// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set -void gpt_params_print_usage(gpt_params & params, std::vector & options); +// call once at the start of a program if it uses libcommon +// initializes the logging system and prints info about the build +void gpt_init(); std::string gpt_params_get_system_info(const gpt_params & params); @@ -420,6 +381,11 @@ static std::vector string_split(const std::string & str, char delim) { bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); +std::string string_from(bool value); +std::string string_from(const std::vector & values); +std::string string_from(const struct llama_context * ctx, const std::vector & tokens); +std::string string_from(const struct llama_context * 
ctx, const struct llama_batch & batch); + // // Filesystem utils // diff --git a/common/log.cpp b/common/log.cpp new file mode 100644 index 000000000..2825a227e --- /dev/null +++ b/common/log.cpp @@ -0,0 +1,401 @@ +#include "log.h" + +#include +#include +#include +#include +#include +#include +#include + +int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA; + +void gpt_log_set_verbosity_thold(int verbosity) { + gpt_log_verbosity_thold = verbosity; +} + +#define LOG_COL_DEFAULT "\033[0m" +#define LOG_COL_BOLD "\033[1m" +#define LOG_COL_RED "\033[31m" +#define LOG_COL_GREEN "\033[32m" +#define LOG_COL_YELLOW "\033[33m" +#define LOG_COL_BLUE "\033[34m" +#define LOG_COL_MAGENTA "\033[35m" +#define LOG_COL_CYAN "\033[36m" +#define LOG_COL_WHITE "\033[37m" + +static int64_t t_us() { + return std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); +} + +// colors +enum gpt_log_col : int { + GPT_LOG_COL_DEFAULT = 0, + GPT_LOG_COL_BOLD, + GPT_LOG_COL_RED, + GPT_LOG_COL_GREEN, + GPT_LOG_COL_YELLOW, + GPT_LOG_COL_BLUE, + GPT_LOG_COL_MAGENTA, + GPT_LOG_COL_CYAN, + GPT_LOG_COL_WHITE, +}; + +// disable colors by default +static std::vector g_col = { + "", + "", + "", + "", + "", + "", + "", + "", + "", +}; + +struct gpt_log_entry { + enum ggml_log_level level; + + bool prefix; + + int64_t timestamp; + + std::vector msg; + + // signals the worker thread to stop + bool is_end; + + void print(FILE * file = nullptr) const { + FILE * fcur = file; + if (!fcur) { + // stderr displays DBG messages only when their verbosity level is not higher than the threshold + // these messages will still be logged to a file + if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) { + return; + } + + fcur = stdout; + + if (level != GGML_LOG_LEVEL_NONE) { + fcur = stderr; + } + } + + if (level != GGML_LOG_LEVEL_NONE && prefix) { + if (timestamp) { + // [M.s.ms.us] + fprintf(fcur, "%s%d.%02d.%03d.%03d%s ", + g_col[GPT_LOG_COL_BLUE], + (int) (timestamp / 1000000 / 60), + (int) (timestamp / 1000000 % 60), + (int) (timestamp / 1000 % 1000), + (int) (timestamp % 1000), + g_col[GPT_LOG_COL_DEFAULT]); + } + + switch (level) { + case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break; + case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break; + case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break; + case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break; + default: + break; + } + } + + fprintf(fcur, "%s", msg.data()); + + if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) { + fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]); + } + + fflush(fcur); + } +}; + +struct gpt_log { + // default capacity - will be expanded if needed + gpt_log() : gpt_log(256) {} + + gpt_log(size_t capacity) { + file = nullptr; + prefix = false; + timestamps = false; + running = false; + t_start = t_us(); + + // initial message size - will be expanded if longer messages arrive + entries.resize(capacity); + for (auto & entry : entries) { + entry.msg.resize(256); + } + + head = 0; + tail = 0; + + resume(); + } + + ~gpt_log() { + pause(); + if (file) { + fclose(file); + } + } + +private: + std::mutex mtx; + std::thread thrd; + std::condition_variable cv; + + FILE * file; + + bool prefix; + bool timestamps; + bool running; + + int64_t t_start; + + // ring buffer of entries + std::vector entries; + size_t 
head; + size_t tail; + + // worker thread copies into this + gpt_log_entry cur; + +public: + void add(enum ggml_log_level level, const char * fmt, va_list args) { + std::lock_guard lock(mtx); + + if (!running) { + // discard messages while the worker thread is paused + return; + } + + auto & entry = entries[tail]; + + { + // cannot use args twice, so make a copy in case we need to expand the buffer + va_list args_copy; + va_copy(args_copy, args); + +#if 1 + const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args); + if (n >= entry.msg.size()) { + entry.msg.resize(n + 1); + vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy); + } +#else + // hack for bolding arguments + + std::stringstream ss; + for (int i = 0; fmt[i] != 0; i++) { + if (fmt[i] == '%') { + ss << LOG_COL_BOLD; + while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++]; + ss << LOG_COL_DEFAULT; + if (fmt[i] == 0) break; + } + ss << fmt[i]; + } + const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args); + if (n >= entry.msg.size()) { + entry.msg.resize(n + 1); + vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy); + } +#endif + } + + entry.level = level; + entry.prefix = prefix; + entry.timestamp = 0; + if (timestamps) { + entry.timestamp = t_us() - t_start; + } + entry.is_end = false; + + tail = (tail + 1) % entries.size(); + if (tail == head) { + // expand the buffer + std::vector new_entries(2*entries.size()); + + size_t new_tail = 0; + + do { + new_entries[new_tail] = std::move(entries[head]); + + head = (head + 1) % entries.size(); + new_tail = (new_tail + 1); + } while (head != tail); + + head = 0; + tail = new_tail; + + for (size_t i = tail; i < new_entries.size(); i++) { + new_entries[i].msg.resize(256); + } + + entries = std::move(new_entries); + } + + cv.notify_one(); + } + + void resume() { + std::lock_guard lock(mtx); + + if (running) { + return; + } + + running = true; + + thrd = std::thread([this]() { + while (true) { + { + std::unique_lock lock(mtx); + cv.wait(lock, [this]() { return head != tail; }); + + cur = entries[head]; + + head = (head + 1) % entries.size(); + } + + if (cur.is_end) { + break; + } + + cur.print(); // stdout and stderr + + if (file) { + cur.print(file); + } + } + }); + } + + void pause() { + { + std::lock_guard lock(mtx); + + if (!running) { + return; + } + + running = false; + + // push an entry to signal the worker thread to stop + { + auto & entry = entries[tail]; + entry.is_end = true; + + tail = (tail + 1) % entries.size(); + } + + cv.notify_one(); + } + + thrd.join(); + } + + void set_file(const char * path) { + pause(); + + if (file) { + fclose(file); + } + + if (path) { + file = fopen(path, "w"); + } else { + file = nullptr; + } + + resume(); + } + + void set_colors(bool colors) { + pause(); + + if (colors) { + g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; + g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD; + g_col[GPT_LOG_COL_RED] = LOG_COL_RED; + g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN; + g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW; + g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE; + g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; + g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN; + g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE; + } else { + for (size_t i = 0; i < g_col.size(); i++) { + g_col[i] = ""; + } + } + + resume(); + } + + void set_prefix(bool prefix) { + std::lock_guard lock(mtx); + + this->prefix = prefix; + } + + void set_timestamps(bool timestamps) { + std::lock_guard lock(mtx); + + 
this->timestamps = timestamps; + } +}; + +// +// public API +// + +struct gpt_log * gpt_log_init() { + return new gpt_log; +} + +struct gpt_log * gpt_log_main() { + static struct gpt_log log; + + return &log; +} + +void gpt_log_pause(struct gpt_log * log) { + log->pause(); +} + +void gpt_log_resume(struct gpt_log * log) { + log->resume(); +} + +void gpt_log_free(struct gpt_log * log) { + delete log; +} + +void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) { + va_list args; + va_start(args, fmt); + log->add(level, fmt, args); + va_end(args); +} + +void gpt_log_set_file(struct gpt_log * log, const char * file) { + log->set_file(file); +} + +void gpt_log_set_colors(struct gpt_log * log, bool colors) { + log->set_colors(colors); +} + +void gpt_log_set_prefix(struct gpt_log * log, bool prefix) { + log->set_prefix(prefix); +} + +void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) { + log->set_timestamps(timestamps); +} diff --git a/common/log.h b/common/log.h index 1bc5328ce..d13f72d89 100644 --- a/common/log.h +++ b/common/log.h @@ -1,724 +1,90 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include +#include "ggml.h" // for ggml_log_level -// -------------------------------- -// -// Basic usage: -// -// -------- -// -// The LOG() and LOG_TEE() macros are ready to go by default -// they do not require any initialization. -// -// LOGLN() and LOG_TEELN() are variants which automatically -// include \n character at the end of the log string. -// -// LOG() behaves exactly like printf, by default writing to a logfile. -// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ). -// -// Default logfile is named -// "llama..log" -// Default LOG_TEE() secondary output target is -// stderr -// -// Logs can be dynamically disabled or enabled using functions: -// log_disable() -// and -// log_enable() -// -// A log target can be changed with: -// log_set_target( string ) -// creating and opening, or re-opening a file by string filename -// or -// log_set_target( FILE* ) -// allowing to point at stderr, stdout, or any valid FILE* file handler. -// -// -------- -// -// End of Basic usage. -// -// -------------------------------- - -// Specifies a log target. -// default uses log_handler() with "llama.log" log file -// this can be changed, by defining LOG_TARGET -// like so: -// -// #define LOG_TARGET (a valid FILE*) -// #include "log.h" -// -// or it can be simply redirected to stdout or stderr -// like so: -// -// #define LOG_TARGET stderr -// #include "log.h" -// -// The log target can also be redirected to a different function -// like so: -// -// #define LOG_TARGET log_handler_different() -// #include "log.h" -// -// FILE* log_handler_different() -// { -// return stderr; -// } -// -// or: -// -// #define LOG_TARGET log_handler_another_one("somelog.log") -// #include "log.h" -// -// FILE* log_handler_another_one(char*filename) -// { -// static FILE* logfile = nullptr; -// (...) -// if( !logfile ) -// { -// fopen(...) -// } -// (...) -// return logfile -// } -// -#ifndef LOG_TARGET - #define LOG_TARGET log_handler() -#endif - -#ifndef LOG_TEE_TARGET - #define LOG_TEE_TARGET stderr -#endif - -// Utility for synchronizing log configuration state -// since std::optional was introduced only in c++17 -enum LogTriState -{ - LogTriStateSame, - LogTriStateFalse, - LogTriStateTrue -}; - -// Utility to obtain "pid" like unique process id and use it when creating log files. 
-inline std::string log_get_pid() -{ - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } - - return pid; -} - -// Utility function for generating log file names with unique id based on thread id. -// invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" -// where the number is a runtime id of the current thread. - -#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension) - -// INTERNAL, DO NOT USE -inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension) -{ - static bool _multilog = false; - - if (multilog != LogTriStateSame) - { - _multilog = multilog == LogTriStateTrue; - } - - std::stringstream buf; - - buf << log_file_basename; - if (_multilog) - { - buf << "."; - buf << log_get_pid(); - } - buf << "."; - buf << log_file_extension; - - return buf.str(); -} - -#ifndef LOG_DEFAULT_FILE_NAME - #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log") -#endif - -// Utility for turning #define values into string literals -// so we can have a define for stderr and -// we can print "stderr" instead of literal stderr, etc. -#define LOG_STRINGIZE1(s) #s -#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s) - -#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET) - -// Allows disabling timestamps. -// in order to disable, define LOG_NO_TIMESTAMPS -// like so: -// -// #define LOG_NO_TIMESTAMPS -// #include "log.h" -// -#ifndef LOG_NO_TIMESTAMPS - #ifndef _MSC_VER - #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #else - #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #endif +#ifndef __GNUC__ +# define LOG_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else - #define LOG_TIMESTAMP_FMT "%s" - #define LOG_TIMESTAMP_VAL ,"" +# define LOG_ATTRIBUTE_FORMAT(...) 
__attribute__((format(printf, __VA_ARGS__))) #endif -#ifdef LOG_TEE_TIMESTAMPS - #ifndef _MSC_VER - #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #else - #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #endif -#else - #define LOG_TEE_TIMESTAMP_FMT "%s" - #define LOG_TEE_TIMESTAMP_VAL ,"" -#endif +#define LOG_DEFAULT_DEBUG 1 +#define LOG_DEFAULT_LLAMA 0 -// Allows disabling file/line/function prefix -// in order to disable, define LOG_NO_FILE_LINE_FUNCTION -// like so: +// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower +// set via gpt_log_set_verbosity() +extern int gpt_log_verbosity_thold; + +void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe + +// the gpt_log uses an internal worker thread to print/write log messages +// when the worker thread is paused, incoming log messages are discarded +struct gpt_log; + +struct gpt_log * gpt_log_init(); +struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit +void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe +void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe +void gpt_log_free (struct gpt_log * log); + +LOG_ATTRIBUTE_FORMAT(3, 4) +void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...); + +// defaults: file = NULL, colors = false, prefix = false, timestamps = false // -// #define LOG_NO_FILE_LINE_FUNCTION -// #include "log.h" +// regular log output: // -#ifndef LOG_NO_FILE_LINE_FUNCTION - #ifndef _MSC_VER - #define LOG_FLF_FMT "[%24s:%5d][%24s] " - #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ - #else - #define LOG_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ - #endif -#else - #define LOG_FLF_FMT "%s" - #define LOG_FLF_VAL ,"" -#endif - -#ifdef LOG_TEE_FILE_LINE_FUNCTION - #ifndef _MSC_VER - #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] " - #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ - #else - #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ - #endif -#else - #define LOG_TEE_FLF_FMT "%s" - #define LOG_TEE_FLF_VAL ,"" -#endif - -// INTERNAL, DO NOT USE -// USE LOG() INSTEAD +// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) +// llm_load_tensors: ggml ctx size = 0.27 MiB +// llm_load_tensors: offloading 32 repeating layers to GPU +// llm_load_tensors: offloading non-repeating layers to GPU // -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) - #define LOG_IMPL(str, ...) 
\ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ +// with prefix = true, timestamps = true, the log output will look like this: +// +// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) +// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB +// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU +// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU +// +// I - info (stdout, V = 0) +// W - warning (stderr, V = 0) +// E - error (stderr, V = 0) +// D - debug (stderr, V = LOG_DEFAULT_DEBUG) +// + +void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe +void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe +void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log +void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix + +// helper macros for logging +// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold +// +// for example: +// +// LOG_DBG("this is a debug message: %d\n", expensive_function()); +// +// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold +// + +#define LOG_TMPL(level, verbosity, ...) \ + do { \ + if ((verbosity) <= gpt_log_verbosity_thold) { \ + gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \ + } \ } while (0) -#else - #define LOG_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - } while (0) -#endif -// INTERNAL, DO NOT USE -// USE LOG_TEE() INSTEAD -// -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) - #define LOG_TEE_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ - { \ - fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TEE_TARGET); \ - } \ - } while (0) -#else - #define LOG_TEE_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ - { \ - fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TEE_TARGET); \ - } \ - } while (0) -#endif +#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__) +#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) -// The '\0' as a last argument, is a trick to bypass the silly -// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro" -// so we can have a single macro which can be called just like printf. +#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__) +#define LOG_WRN(...) 
LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__) +#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__) +#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__) -// Main LOG macro. -// behaves like printf, and supports arguments the exact same way. -// -#if !defined(_MSC_VER) || defined(__clang__) - #define LOG(...) LOG_IMPL(__VA_ARGS__, "") -#else - #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") -#endif - -// Main TEE macro. -// does the same as LOG -// and -// simultaneously writes stderr. -// -// Secondary target can be changed just like LOG_TARGET -// by defining LOG_TEE_TARGET -// -#if !defined(_MSC_VER) || defined(__clang__) - #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") -#else - #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") -#endif - -// LOG macro variants with auto endline. -#if !defined(_MSC_VER) || defined(__clang__) - #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") - #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") -#else - #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n") - #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n") -#endif - -// INTERNAL, DO NOT USE -inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) -{ - static bool _initialized = false; - static bool _append = false; - static bool _disabled = filename.empty() && target == nullptr; - static std::string log_current_filename{filename}; - static FILE *log_current_target{target}; - static FILE *logfile = nullptr; - - if (change) - { - if (append != LogTriStateSame) - { - _append = append == LogTriStateTrue; - return logfile; - } - - if (disable == LogTriStateTrue) - { - // Disable primary target - _disabled = true; - } - // If previously disabled, only enable, and keep previous target - else if (disable == LogTriStateFalse) - { - _disabled = false; - } - // Otherwise, process the arguments - else if (log_current_filename != filename || log_current_target != target) - { - _initialized = false; - } - } - - if (_disabled) - { - // Log is disabled - return nullptr; - } - - if (_initialized) - { - // with fallback in case something went wrong - return logfile ? logfile : stderr; - } - - // do the (re)initialization - if (target != nullptr) - { - if (logfile != nullptr && logfile != stdout && logfile != stderr) - { - fclose(logfile); - } - - log_current_filename = LOG_DEFAULT_FILE_NAME; - log_current_target = target; - - logfile = target; - } - else - { - if (log_current_filename != filename) - { - if (logfile != nullptr && logfile != stdout && logfile != stderr) - { - fclose(logfile); - } - } - - logfile = fopen(filename.c_str(), _append ? "a" : "w"); - } - - if (!logfile) - { - // Verify whether the file was opened, otherwise fallback to stderr - logfile = stderr; - - fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno)); - fflush(stderr); - - // At this point we let the init flag be to true below, and let the target fallback to stderr - // otherwise we would repeatedly fopen() which was already unsuccessful - } - - _initialized = true; - - return logfile ? 
logfile : stderr; -} - -// INTERNAL, DO NOT USE -inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) -{ - return log_handler1_impl(change, append, disable, filename, target); -} - -// Disables logs entirely at runtime. -// Makes LOG() and LOG_TEE() produce no output, -// until enabled back. -#define log_disable() log_disable_impl() - -// INTERNAL, DO NOT USE -inline FILE *log_disable_impl() -{ - return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue); -} - -// Enables logs at runtime. -#define log_enable() log_enable_impl() - -// INTERNAL, DO NOT USE -inline FILE *log_enable_impl() -{ - return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse); -} - -// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) -#define log_set_target(target) log_set_target_impl(target) - -// INTERNAL, DO NOT USE -inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); } -inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); } - -// INTERNAL, DO NOT USE -inline FILE *log_handler() { return log_handler1_impl(); } - -// Enable or disable creating separate log files for each run. -// can ONLY be invoked BEFORE first log use. -#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "") -// Enable or disable append mode for log file. -// can ONLY be invoked BEFORE first log use. -#define log_append(enable) log_append_impl(enable) -// INTERNAL, DO NOT USE -inline FILE *log_append_impl(bool enable) -{ - return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame); -} - -inline void log_test() -{ - log_disable(); - LOG("01 Hello World to nobody, because logs are disabled!\n"); - log_enable(); - LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! 
)!\n", LOG_STRINGIZE(LOG_TARGET)); - LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n"); - log_set_target(stderr); - LOG("04 Hello World to stderr!\n"); - LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n"); - log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("06 Hello World to default log file!\n"); - log_set_target(stdout); - LOG("07 Hello World to stdout!\n"); - log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("08 Hello World to default log file again!\n"); - log_disable(); - LOG("09 Hello World _1_ into the void!\n"); - log_enable(); - LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n"); - log_disable(); - log_set_target("llama.anotherlog.log"); - LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n"); - log_enable(); - LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n"); - log_set_target("llama.yetanotherlog.log"); - LOG("13 Hello World this time in yet new file?\n"); - log_set_target(log_filename_generator("llama_autonamed", "log")); - LOG("14 Hello World in log with generated filename!\n"); -#ifdef _MSC_VER - LOG_TEE("15 Hello msvc TEE without arguments\n"); - LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test"); - LOG_TEELN("17 Hello msvc TEELN without arguments\n"); - LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test"); - LOG("19 Hello msvc LOG without arguments\n"); - LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test"); - LOGLN("21 Hello msvc LOGLN without arguments\n"); - LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test"); -#endif -} - -inline bool log_param_single_parse(const std::string & param) -{ - if ( param == "--log-test") - { - log_test(); - return true; - } - - if ( param == "--log-disable") - { - log_disable(); - return true; - } - - if ( param == "--log-enable") - { - log_enable(); - return true; - } - - if (param == "--log-new") - { - log_multilog(true); - return true; - } - - if (param == "--log-append") - { - log_append(true); - return true; - } - - return false; -} - -inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string()) -{ - if ( param == "--log-file") - { - if (!check_but_dont_parse) - { - log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log")); - } - - return true; - } - - return false; -} - -inline void log_print_usage() -{ - printf("log options:\n"); - /* format - printf(" -h, --help show this help message and exit\n");*/ - /* spacing - printf("__-param----------------Description\n");*/ - printf(" --log-test Run simple logging test\n"); - printf(" --log-disable Disable trace logs\n"); - printf(" --log-enable Enable trace logs\n"); - printf(" --log-file Specify a log filename (without extension)\n"); - printf(" --log-new Create a separate new log file on start. 
" - "Each log file will have unique name: \"..log\"\n"); - printf(" --log-append Don't truncate the old log file.\n"); - printf("\n"); -} - -#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) - -// INTERNAL, DO NOT USE -inline void log_dump_cmdline_impl(int argc, char **argv) -{ - std::stringstream buf; - for (int i = 0; i < argc; ++i) - { - if (std::string(argv[i]).find(' ') != std::string::npos) - { - buf << " \"" << argv[i] <<"\""; - } - else - { - buf << " " << argv[i]; - } - } - LOGLN("Cmd:%s", buf.str().c_str()); -} - -#define log_tostr(var) log_var_to_string_impl(var).c_str() - -inline std::string log_var_to_string_impl(bool var) -{ - return var ? "true" : "false"; -} - -inline std::string log_var_to_string_impl(std::string var) -{ - return var; -} - -inline std::string log_var_to_string_impl(const std::vector & var) -{ - std::stringstream buf; - buf << "[ "; - bool first = true; - for (auto e : var) - { - if (first) - { - first = false; - } - else - { - buf << ", "; - } - buf << std::to_string(e); - } - buf << " ]"; - - return buf.str(); -} - -template -inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens) -{ - std::stringstream buf; - buf << "[ "; - - bool first = true; - for (const auto & token : tokens) - { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = llama_token_to_piece(ctx, token); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf - << "'" << detokenized << "'" - << ":" << std::to_string(token); - } - buf << " ]"; - - return buf.str(); -} - -template -inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch) -{ - std::stringstream buf; - buf << "[ "; - - bool first = true; - for (int i = 0; i < batch.n_tokens; ++i) - { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = llama_token_to_piece(ctx, batch.token[i]); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf - << "\n" << std::to_string(i) - << ":token '" << detokenized << "'" - << ":pos " << std::to_string(batch.pos[i]) - << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) - << ":seq_id " << std::to_string(batch.seq_id[i][0]) - << ":logits " << std::to_string(batch.logits[i]); - } - buf << " ]"; - - return buf.str(); -} - -#ifdef LOG_DISABLE_LOGS - -#undef LOG -#define LOG(...) // dummy stub -#undef LOGLN -#define LOGLN(...) // dummy stub - -#undef LOG_TEE -#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf - -#undef LOG_TEELN -#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf - -#undef LOG_DISABLE -#define LOG_DISABLE() // dummy stub - -#undef LOG_ENABLE -#define LOG_ENABLE() // dummy stub - -#undef LOG_ENABLE -#define LOG_ENABLE() // dummy stub - -#undef LOG_SET_TARGET -#define LOG_SET_TARGET(...) // dummy stub - -#undef LOG_DUMP_CMDLINE -#define LOG_DUMP_CMDLINE(...) // dummy stub - -#endif // LOG_DISABLE_LOGS +#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__) +#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__) +#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__) +#define LOG_DBGV(verbosity, ...) 
LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index 3ca112ef1..7953c723e 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -2,8 +2,11 @@ #include "common.h" #include "log.h" +#include #include +#include #include +#include void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { diff --git a/common/sampling.cpp b/common/sampling.cpp index 7806b77e0..e51d07611 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -2,6 +2,9 @@ #include "common.h" +#include +#include + // the ring buffer works similarly to std::deque, but with a fixed capacity // TODO: deduplicate with llama-impl.h template @@ -139,7 +142,7 @@ std::string gpt_sampler_params::print() const { struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); - lparams.no_perf = false; // TODO: control via params + lparams.no_perf = params.no_perf; auto * result = new gpt_sampler { /* .params = */ params, @@ -254,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // TODO: measure grammar performance if (gsmpl) { - llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + llama_perf_sampler_print(gsmpl->chain); } if (ctx) { - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); } } @@ -307,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context return cur_p.data[cur_p.selected].id; } +uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) { + return llama_sampler_get_seed(gsmpl->chain); +} + // helpers llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) { @@ -318,7 +325,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) { } std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { - std::string result = "\tlogits "; + std::string result = "logits "; for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); @@ -420,7 +427,7 @@ std::vector gpt_sampler_types_from_names(const std::vector gpt_sampler_types_from_chars(const std::string & chars) { - std::unordered_map sampler_name_map { + std::unordered_map sampler_name_map = { { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, diff --git a/common/sampling.h b/common/sampling.h index 654e0c513..d0e1a9203 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -2,61 +2,11 @@ #include "llama.h" +#include "common.h" + #include #include -enum gpt_sampler_type { - GPT_SAMPLER_TYPE_NONE = 0, - GPT_SAMPLER_TYPE_TOP_K = 1, - GPT_SAMPLER_TYPE_TOP_P = 2, - GPT_SAMPLER_TYPE_MIN_P = 3, - GPT_SAMPLER_TYPE_TFS_Z = 4, - GPT_SAMPLER_TYPE_TYPICAL_P = 5, - GPT_SAMPLER_TYPE_TEMPERATURE = 6, -}; - -// sampling parameters -struct gpt_sampler_params { - uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = false; // consider newlines as a repeatable token - bool ignore_eos = false; - - std::vector samplers = { - GPT_SAMPLER_TYPE_TOP_K, - GPT_SAMPLER_TYPE_TFS_Z, - GPT_SAMPLER_TYPE_TYPICAL_P, - GPT_SAMPLER_TYPE_TOP_P, - GPT_SAMPLER_TYPE_MIN_P, - GPT_SAMPLER_TYPE_TEMPERATURE - }; - - std::string grammar; // optional BNF-like grammar to constrain sampling - - std::vector logit_bias; // logit biases to apply - - // print the parameters into a string - std::string print() const; -}; - // gpt_sampler extends llama_sampler with additional functionality: // // - grammar support @@ -110,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl); + // helpers // access the internal list of current candidate tokens diff --git a/common/train.cpp b/common/train.cpp index fef1e57c9..661ad8382 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1,9 +1,11 @@ #include "train.h" #include "common.h" +#include #include #include #include +#include struct random_normal_distribution { std::mt19937 gen; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2dfc91446..f333a567f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -131,12 +131,14 @@ class Model: def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() - if len(self.part_names) > 1: + index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + index_file = self.dir_model / index_name + + if index_file.is_file(): self.tensor_names = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" - index_name += ".index.json" logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(self.dir_model / index_name, "r", encoding="utf-8") as f: + with open(index_file, "r", encoding="utf-8") as f: index: dict[str, Any] = json.load(f) weight_map = index.get("weight_map") if weight_map is None or not isinstance(weight_map, dict): @@ -144,6 +146,7 @@ class Model: self.tensor_names.update(weight_map.keys()) else: self.tensor_names = tensor_names_from_parts + weight_map = {} for part_name in self.part_names: logger.info(f"gguf: loading model part '{part_name}'") @@ -170,9 +173,17 @@ class Model: data = LazyTorchTensor.from_eager(data) yield name, data - # only verify tensor name presence; it 
doesn't matter if they are not in the right files - if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") + # verify tensor name presence and identify potentially missing files + if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: + missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) + extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) + missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) + if len(extra) == 0 and len(missing_files) > 0: + raise ValueError(f"Missing or incomplete model files: {missing_files}") + else: + raise ValueError("Mismatch between weight map and model parts for tensor names:\n" + f"Missing tensors: {missing}\n" + f"Extra tensors: {extra}") def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: if key not in gguf.MODEL_TENSORS[self.model_arch]: @@ -305,6 +316,8 @@ class Model: gguf.MODEL_TENSOR.TIME_MIX_FIRST, gguf.MODEL_TENSOR.TIME_MIX_W1, gguf.MODEL_TENSOR.TIME_MIX_W2, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, ) ) or not new_name.endswith(".weight") @@ -627,6 +640,9 @@ class Model: if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct res = "exaone" + if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": + # ref: https://huggingface.co/microsoft/phi-2 + res = "phi-2" if res is None: logger.warning("\n") @@ -1485,7 +1501,7 @@ class StableLMModel(Model): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA @@ -1839,6 +1855,60 @@ class MiniCPMModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(Model): + model_arch = gguf.MODEL_ARCH.MINICPM3 + + def set_gguf_parameters(self): + hparams = self.hparams + + rope_dims = hparams["qk_rope_head_dim"] + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is None: + return + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the 
required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + + def set_vocab(self): + self._set_vocab_llama_hf() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + @Model.register("QWenLMHeadModel") class QwenModel(Model): model_arch = gguf.MODEL_ARCH.QWEN @@ -2778,6 +2848,8 @@ class Rwkv6Model(Model): self.gguf_writer.add_tokenizer_model("rwkv") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] @@ -2946,6 +3018,66 @@ class OlmoModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("OlmoeForCausalLM") +class OlmoeModel(Model): + model_arch = gguf.MODEL_ARCH.OLMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_rms_eps(1e-5) + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @Model.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index ff4955f9c..021f65abd 100755 --- a/convert_hf_to_gguf_update.py +++ 
b/convert_hf_to_gguf_update.py @@ -31,6 +31,7 @@ import re import requests import sys import json +import shutil from hashlib import sha256 from enum import IntEnum, auto @@ -97,6 +98,7 @@ models = [ {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", }, {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", }, {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, + {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, ] @@ -125,12 +127,27 @@ def download_model(model): if tokt == TOKENIZER_TYPE.UGM: files.append("spiece.model") - for file in files: - save_path = f"models/tokenizers/{name}/{file}" - if os.path.isfile(save_path): - logger.info(f"{name}: File {save_path} already exists - skipping") - continue - download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) + if os.path.isdir(repo): + # If repo is a path on the file system, copy the directory + for file in files: + src_path = os.path.join(repo, file) + dst_path = f"models/tokenizers/{name}/{file}" + if os.path.isfile(dst_path): + logger.info(f"{name}: File {dst_path} already exists - skipping") + continue + if os.path.isfile(src_path): + shutil.copy2(src_path, dst_path) + logger.info(f"{name}: Copied {src_path} to {dst_path}") + else: + logger.warning(f"{name}: Source file {src_path} does not exist") + else: + # If repo is a URL, download the files + for file in files: + save_path = f"models/tokenizers/{name}/{file}" + if os.path.isfile(save_path): + logger.info(f"{name}: File {save_path} already exists - skipping") + continue + download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) for model in models: diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index ef088034c..439a78de1 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -367,7 +367,13 @@ if __name__ == '__main__': yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B))) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - dest = super().modify_tensors(data_torch, name, bid) + dest = list(super().modify_tensors(data_torch, name, bid)) + # some archs may have the same tensor for lm_head and output (tie word embeddings) + # in this case, adapters targeting lm_head will fail when using llama-export-lora + # therefore, we ignore them for now + # see: https://github.com/ggerganov/llama.cpp/issues/9065 + if name == "lm_head.weight" and len(dest) == 0: + raise ValueError("lm_head is present in adapter, but is ignored in base model") for dest_name, dest_data in dest: assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() diff --git a/docs/build.md b/docs/build.md index 152d46d6f..faa0ecfa4 100644 --- a/docs/build.md +++ b/docs/build.md @@ -380,3 +380,9 @@ For detailed info, such as model/device supports, CANN install, please refer to ### Android To read documentation for how to build on Android, [click here](./android.md) + +### Arm CPU optimized mulmat kernels + +Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. 
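As a minimal sketch only (model file names and the build directory are placeholders, and the `GGML_LLAMAFILE` / `Q4_0_4_4` requirements described just below still apply), enabling these kernels and producing a compatible quantization might look like:

```sh
# configure with the Arm cpu-type flags from above and LLAMAFILE disabled (needed for Q4_0_4_4)
cmake -B build -DGGML_LLAMAFILE=OFF -DCMAKE_C_FLAGS="-march=armv8.2a+i8mm+sve"
cmake --build build --config Release

# re-quantize an existing f16 GGUF into one of the Arm-optimized formats (placeholder file names)
./build/bin/llama-quantize model-f16.gguf model-Q4_0_4_4.gguf Q4_0_4_4
```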
Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. + +To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index f3b0c433b..4a15941f1 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,47 +1,28 @@ +#include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include -#include #include #include #include -// mutates the input string -static std::vector parse_list(char * p) { - std::vector ret; - - char * q = p; - - while (*p) { - if (*p == ',') { - *p = '\0'; - ret.push_back(std::atoi(q)); - q = p + 1; - } - - ++p; - } - - ret.push_back(std::atoi(q)); - - return ret; -} - static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { return 1; } + gpt_init(); + int is_pp_shared = params.is_pp_shared; std::vector n_pp = params.n_pp; @@ -98,7 +79,7 @@ int main(int argc, char ** argv) { const int ret = llama_decode(ctx, batch_view); if (ret != 0) { - LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } @@ -115,17 +96,17 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } } if (!params.batched_bench_output_jsonl) { - LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); - LOG_TEE("\n"); - LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); - LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); + LOG("\n"); + LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, 
params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG("\n"); + LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); + LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); } for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { @@ -155,7 +136,7 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -177,7 +158,7 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } } @@ -195,21 +176,21 @@ int main(int argc, char ** argv) { const float speed = n_kv / t; if(params.batched_bench_output_jsonl) { - LOG_TEE( + LOG( "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n", n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed ); } else { - LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); + LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); } } } } - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG("\n"); + llama_perf_context_print(ctx); llama_batch_free(batch); @@ -218,7 +199,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 4bc2bbf2c..10f2e7fd1 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -140,8 +140,6 @@ while n_cur <= n_len { let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) - llama_sampler_accept(smpl, new_token_id) - // is it an end of stream? 
-> mark the stream as finished if llama_token_is_eog(model, new_token_id) || n_cur == n_len { i_batch[i] = -1 @@ -202,8 +200,8 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT) -llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN) +llama_perf_sampler_print(smpl) +llama_perf_context_print(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index f5f309022..7887a43d6 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,4 +1,6 @@ +#include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -7,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -18,11 +20,11 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } + gpt_init(); // number of parallel batches int n_parallel = params.n_parallel; @@ -42,7 +44,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: error: unable to load model\n" , __func__); return 1; } @@ -72,31 +74,29 @@ int main(int argc, char ** argv) { llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); return 1; } const int n_ctx = llama_n_ctx(ctx); - LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); - LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); + LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); + LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__); return 1; } // print the prompt token-by-token - fprintf(stderr, "\n"); + LOG("\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } - fflush(stderr); - // create a llama_batch // we use this object to submit token data 
for decoding llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); @@ -114,7 +114,7 @@ int main(int argc, char ** argv) { if (llama_model_has_encoder(model)) { if (llama_encode(ctx, batch)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -131,7 +131,7 @@ int main(int argc, char ** argv) { batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -142,7 +142,7 @@ int main(int argc, char ** argv) { //} if (n_parallel > 1) { - LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); } // main loop @@ -172,14 +172,12 @@ int main(int argc, char ** argv) { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); - llama_sampler_accept(smpl, new_token_id); - // is it an end of generation? -> mark the stream as finished if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; - LOG_TEE("\n"); + LOG("\n"); if (n_parallel > 1) { - LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); + LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); } continue; @@ -187,8 +185,7 @@ int main(int argc, char ** argv) { // if there is only one stream, we print immediately to stdout if (n_parallel == 1) { - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); } streams[i] += llama_token_to_piece(ctx, new_token_id); @@ -210,29 +207,27 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); - if (n_parallel > 1) { - LOG_TEE("\n"); + LOG("\n"); for (int32_t i = 0; i < n_parallel; ++i) { - LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); + LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); } } const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG("\n"); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 97622f4f4..922daf528 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -183,7 +183,7 @@ int main(int argc, char ** argv) { ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); - TENSOR_DUMP(gf->nodes[0]); + TENSOR_DUMP(ggml_graph_node(gf, 0)); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); @@ -224,7 +224,7 @@ int main(int argc, char ** argv) { // Let's use the F32 result from above as a reference for the quantized multiplication - float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); + float sum_of_F32_reference = 
tensor_sum_elements(ggml_graph_node(gf, 0)); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("=====================================================================================\n"); @@ -252,7 +252,7 @@ int main(int argc, char ** argv) { // Check that the matrix multiplication result is in the right ballpark // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different - float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]); + float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0)); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 8ca9f8915..ecff95f9a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_ const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads; try { w->token_embedding_table.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); w->rms_att_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); w->rms_ffn_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); w->wq.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); 
w->wo.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->w1.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->w2.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); w->w3.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->rms_final_weight.resize(p->dim); - LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); if (shared_weights) { w->wcls = {}; } else { w->wcls.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); } } catch (std::length_error &) { @@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL fseek(f, 0, SEEK_END); auto end = ftell(f); if (curr != end) { - LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); + LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); return 1; } @@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL } static void print_sample_weights(TransformerWeights *w){ - LOG("----- Quick print of first of the weight vales of all the variables\n"); - LOG("%f\n", w->token_embedding_table[0]); - LOG("%f\n", w->rms_att_weight[0]); - LOG("%f\n", w->rms_ffn_weight[0]); + LOG_INF("----- Quick print of first of the weight vales of all the variables\n"); + LOG_INF("%f\n", w->token_embedding_table[0]); + LOG_INF("%f\n", w->rms_att_weight[0]); + LOG_INF("%f\n", w->rms_ffn_weight[0]); - LOG("%f\n", w->wq[0]); - LOG("%f\n", w->wk[0]); - LOG("%f\n", w->wv[0]); - LOG("%f\n", w->wo[0]); - LOG("%f\n", w->w1[0]); - LOG("%f\n", w->w2[0]); - LOG("%f\n", w->w3[0]); - LOG("%f\n", w->rms_att_weight[0]); - if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); + LOG_INF("%f\n", w->wq[0]); + LOG_INF("%f\n", w->wk[0]); + LOG_INF("%f\n", w->wv[0]); + LOG_INF("%f\n", w->wo[0]); + LOG_INF("%f\n", w->w1[0]); + LOG_INF("%f\n", w->w2[0]); + LOG_INF("%f\n", w->w3[0]); + LOG_INF("%f\n", w->rms_att_weight[0]); + if (!w->wcls.empty()) LOG_INF("%f\n", 
w->wcls[0]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -318,20 +319,20 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); - LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); - LOG("%s: n_embd: %u\n", __func__, params->n_embd); - LOG("%s: n_mult: %u\n", __func__, params->n_mult); - LOG("%s: n_head: %u\n", __func__, params->n_head); - LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); - LOG("%s: n_ff: %u\n", __func__, params->n_ff); - LOG("%s: n_layer: %u\n", __func__, params->n_layer); - LOG("%s: n_rot: %u\n", __func__, params->n_rot); + LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab); + LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx); + LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd); + LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult); + LOG_INF("%s: n_head: %u\n", __func__, params->n_head); + LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff); + LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer); + LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot); } static void print_tensor_info(const struct ggml_context * ctx) { for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - LOG("%s: Allocating ", __func__); + LOG_INF("%s: Allocating ", __func__); int64_t total = 1; int i = 0; for (; i < ggml_n_dims(t); ++i) { @@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) { static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { if (is_ggml_file(filename)) { - LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); + LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam gguf_free(ctx); } else { // assume llama2.c vocabulary - LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); + LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); llama_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); @@ -871,23 +872,25 @@ static std::string basename(const std::string &path) { } int main(int argc, char ** argv) { + gpt_init(); + struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { return 1; } - log_set_target(stdout); + Config config; TransformerWeights weights = {}; { - LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); + LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); FILE * file = fopen(params.fn_llama2c_model, "rb"); if (!file) { - LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); return 1; } // read in the config header if (fread(&config, sizeof(Config), 1, file) != 1) { - LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); return 1; } auto shared_weights = config.vocab_size > 0; @@ -896,7 +899,7 @@ int main(int argc, char ** argv) { // read in the Transformer 
weights alloc_weights(&weights, &config, shared_weights); if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { - LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); return 1; } fclose(file); @@ -929,7 +932,7 @@ int main(int argc, char ** argv) { model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); - LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); return 0; diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 0795175a1..41bf4eb2a 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include "ggml.h" @@ -12,14 +13,15 @@ #include "ggml-metal.h" #endif +#include +#include #include +#include +#include +#include #include #include #include -#include -#include -#include -#include ////////////////////////////////////////////////// @@ -388,8 +390,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { return 1; } diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 6ec3141af..a969c486d 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -12,12 +12,9 @@ #include #include +#include #include -#include #include -#include -#include -#include #define DEBUG_POS 5 @@ -229,8 +226,8 @@ static ggml_status compute_piter( result.eigenvectors.resize(params.n_batch); result.distances.resize(params.n_batch); // get output nodes - for (int i = 0; i < gf->n_nodes; ++i) { - auto node = gf->nodes[i]; + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + auto node = ggml_graph_node(gf, i); int iter = -1; // find b_tensor (without copying data from device) if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 630f7c1c7..a438dcb5a 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,4 +1,6 @@ +#include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -38,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu llama_kv_cache_clear(ctx); // run model - fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) { // encoder-only model if (llama_encode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to encode\n", __func__); + LOG_ERR("%s : failed to encode\n", __func__); } } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) { // decoder-only model if 
(llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to decode\n", __func__); + LOG_ERR("%s : failed to decode\n", __func__); } } @@ -79,19 +81,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { return 1; } + gpt_init(); + params.embedding = true; // For non-causal models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; - print_build_info(); - - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - llama_backend_init(); llama_numa_init(params.numa); @@ -101,7 +100,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } @@ -111,19 +110,19 @@ int main(int argc, char ** argv) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) { - fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__); + LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__); return 1; } if (n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } // split the prompt into lines @@ -138,7 +137,7 @@ int main(int argc, char ** argv) { for (const auto & prompt : prompts) { auto inp = ::llama_tokenize(ctx, prompt, true, false); if (inp.size() > n_batch) { - fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); return 1; } @@ -149,20 +148,20 @@ int main(int argc, char ** argv) { // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' for (auto & inp : inputs) { if (inp.empty() || inp.back() != llama_token_sep(model)) { - fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__); - fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); + LOG_WRN("%s: last token in the prompt is not SEP\n", __func__); + LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); } } // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) inputs.size(); i++) { - fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); for (int j = 0; j < (int) inputs[i].size(); j++) { 
- fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); } - fprintf(stderr, "\n\n"); + LOG("\n\n"); } } @@ -213,57 +212,57 @@ int main(int argc, char ** argv) { batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); if (params.embd_out.empty()) { - fprintf(stdout, "\n"); + LOG("\n"); if (pooling_type == LLAMA_POOLING_TYPE_NONE) { for (int j = 0; j < n_embd_count; j++) { - fprintf(stdout, "embedding %d: ", j); + LOG("embedding %d: ", j); for (int i = 0; i < std::min(3, n_embd); i++) { if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + LOG("%6.0f ", emb[j * n_embd + i]); } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + LOG("%9.6f ", emb[j * n_embd + i]); } } - fprintf(stdout, " ... "); + LOG(" ... "); for (int i = n_embd - 3; i < n_embd; i++) { if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + LOG("%6.0f ", emb[j * n_embd + i]); } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + LOG("%9.6f ", emb[j * n_embd + i]); } } - fprintf(stdout, "\n"); + LOG("\n"); } } else { // print the first part of the embeddings or for a single prompt, the full embedding for (int j = 0; j < n_prompts; j++) { - fprintf(stdout, "embedding %d: ", j); + LOG("embedding %d: ", j); for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + LOG("%6.0f ", emb[j * n_embd + i]); } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + LOG("%9.6f ", emb[j * n_embd + i]); } } - fprintf(stdout, "\n"); + LOG("\n"); } // print cosine similarity matrix if (n_prompts > 1) { - fprintf(stdout, "\n"); - printf("cosine similarity matrix:\n\n"); + LOG("\n"); + LOG("cosine similarity matrix:\n\n"); for (int i = 0; i < n_prompts; i++) { - fprintf(stdout, "%6.6s ", prompts[i].c_str()); + LOG("%6.6s ", prompts[i].c_str()); } - fprintf(stdout, "\n"); + LOG("\n"); for (int i = 0; i < n_prompts; i++) { for (int j = 0; j < n_prompts; j++) { float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f ", sim); + LOG("%6.2f ", sim); } - fprintf(stdout, "%1.10s", prompts[i].c_str()); - fprintf(stdout, "\n"); + LOG("%1.10s", prompts[i].c_str()); + LOG("\n"); } } } @@ -272,43 +271,43 @@ int main(int argc, char ** argv) { if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { const bool notArray = params.embd_out != "array"; - fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); + LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); for (int j = 0;;) { // at least one iteration (one prompt) - if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); - fprintf(stdout, "["); + if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); + LOG("["); for (int i = 0;;) { // at least one iteration (n_embd > 0) - fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); + LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); i++; - if (i < n_embd) fprintf(stdout, ","); else break; + if (i < n_embd) LOG(","); else break; } - fprintf(stdout, notArray ? "]\n }" : "]"); + LOG(notArray ? "]\n }" : "]"); j++; - if (j < n_embd_count) fprintf(stdout, notArray ? 
",\n" : ","); else break; + if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break; } - fprintf(stdout, notArray ? "\n ]" : "]\n"); + LOG(notArray ? "\n ]" : "]\n"); if (params.embd_out == "json+" && n_prompts > 1) { - fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); + LOG(",\n \"cosineSimilarity\": [\n"); for (int i = 0;;) { // at least two iteration (n_embd_count > 1) - fprintf(stdout, " ["); + LOG(" ["); for (int j = 0;;) { // at least two iteration (n_embd_count > 1) float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f", sim); + LOG("%6.2f", sim); j++; - if (j < n_embd_count) fprintf(stdout, ", "); else break; + if (j < n_embd_count) LOG(", "); else break; } - fprintf(stdout, " ]"); + LOG(" ]"); i++; - if (i < n_embd_count) fprintf(stdout, ",\n"); else break; + if (i < n_embd_count) LOG(",\n"); else break; } - fprintf(stdout, "\n ]"); + LOG("\n ]"); } - if (notArray) fprintf(stdout, "\n}\n"); + if (notArray) LOG("\n}\n"); } - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG("\n"); + llama_perf_context_print(ctx); // clean up llama_batch_free(batch); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 881111ffd..6d629fe4e 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,11 +1,11 @@ +#include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include "ggml.h" #include -#include #include -#include #include /** @@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne GGML_ASSERT(n > 0); float sum = 0; for (int64_t i3 = 0; i3 < ne[3]; i3++) { - printf(" [\n"); + LOG(" [\n"); for (int64_t i2 = 0; i2 < ne[2]; i2++) { if (i2 == n && ne[2] > 2*n) { - printf(" ..., \n"); + LOG(" ..., \n"); i2 = ne[2] - n; } - printf(" [\n"); + LOG(" [\n"); for (int64_t i1 = 0; i1 < ne[1]; i1++) { if (i1 == n && ne[1] > 2*n) { - printf(" ..., \n"); + LOG(" ..., \n"); i1 = ne[1] - n; } - printf(" ["); + LOG(" ["); for (int64_t i0 = 0; i0 < ne[0]; i0++) { if (i0 == n && ne[0] > 2*n) { - printf("..., "); + LOG("..., "); i0 = ne[0] - n; } size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; @@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } else { GGML_ABORT("fatal error"); } - printf("%12.4f", v); + LOG("%12.4f", v); sum += v; - if (i0 < ne[0] - 1) printf(", "); + if (i0 < ne[0] - 1) LOG(", "); } - printf("],\n"); + LOG("],\n"); } - printf(" ],\n"); + LOG(" ],\n"); } - printf(" ]\n"); - printf(" sum = %f\n", sum); + LOG(" ]\n"); + LOG(" sum = %f\n", sum); } } @@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); } - printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); + LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, + t->name, ggml_type_name(t->type), ggml_op_desc(t), + src0->name, ggml_ne_string(src0).c_str(), + src1 ? 
src1_str : "", + ggml_ne_string(t).c_str()); // copy the data from the GPU memory if needed @@ -132,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) { std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -144,12 +144,11 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - print_build_info(); + gpt_init(); llama_backend_init(); llama_numa_init(params.numa); @@ -166,14 +165,15 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); + LOG_ERR("%s : failed to init\n", __func__); return 1; } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); } bool OK = run(ctx, params); @@ -181,8 +181,8 @@ int main(int argc, char ** argv) { return 1; } - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG("\n"); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 544e7fff6..0051a5eb6 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "ggml.h" #include "ggml-alloc.h" @@ -369,7 +370,7 @@ struct lora_merge_ctx { // write data to output file { - auto result = gf->nodes[gf->n_nodes - 1]; + auto * result = ggml_graph_node(gf, -1); size_t len = ggml_nbytes(result); if (read_buf.size() < len) { read_buf.resize(len); @@ -401,12 +402,11 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } - g_verbose = (params.verbosity == 1); + g_verbose = (params.verbosity > 1); try { lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads); ctx.run_merge(); diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 8b1dafd63..b6d4725fd 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include @@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); gpt_params params; - auto options = gpt_params_parser_init(params, ex); + auto ctx_arg = gpt_params_parser_init(params, ex); file << "| Argument | Explanation |\n"; file << "| -------- | ----------- |\n"; - for (auto & opt : options) { + for (auto & opt : ctx_arg.options) { file << "| `"; // args for (const auto & arg : opt.args) { diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 881f0451c..82c239b83 
100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -152,7 +152,7 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p throw std::invalid_argument("error: invalid parameter for argument: " + arg); } - if (argc - arg_idx < 2) { + if (argc - arg_idx != 2) { throw std::invalid_argument("error: bad arguments"); } @@ -389,10 +389,17 @@ static void gguf_merge(const split_params & split_params) { int n_split = 1; int total_tensors = 0; - auto * ctx_out = gguf_init_empty(); + // avoid overwriting existing output file + if (std::ifstream(split_params.output.c_str())) { + fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str()); + exit(EXIT_FAILURE); + } + std::ofstream fout(split_params.output.c_str(), std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors + auto * ctx_out = gguf_init_empty(); + std::vector read_data; std::vector ctx_metas; std::vector ctx_ggufs; diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index e1efbf573..20b99a4fd 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -121,7 +122,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_decode(ctx, bat); llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1); - llama_sampler_accept(smpl, token); if (token == eos_token) { break; @@ -154,11 +154,12 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } + gpt_init(); + llama_model_params mparams = llama_model_params_from_gpt_params(params); llama_context_params cparams = llama_context_params_from_gpt_params(params); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 15a3f0d14..265281699 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,4 +1,6 @@ +#include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -18,12 +20,12 @@ #endif static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s \\\n" - " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" + LOG("\nexample usage:\n"); + LOG("\n %s \\\n" + " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n" " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); - LOG_TEE("\n"); + LOG("\n"); } struct Stats { @@ -124,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * e.counts.resize(src1->ne[0]*n_as, 0); } else if (e.values.size() != (size_t)src1->ne[0]*n_as) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); exit(1); //GGML_ABORT("fatal error"); } - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); - } 
+ LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); // loop over all possible experts, regardless if they are used or not in the batch for (int ex = 0; ex < n_as; ++ex) { size_t e_start = ex*src1->ne[0]; @@ -150,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * e.values[e_start + j] += x[j]*x[j]; e.counts[e_start + j]++; if (!std::isfinite(e.values[e_start + j])) { - fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str()); + LOG("\n"); + LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str()); exit(1); } } @@ -173,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * e.counts.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ABORT("fatal error"); } ++e.ncall; - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); - } + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); for (int row = 0; row < (int)src1->ne[1]; ++row) { const float * x = data + row * src1->ne[0]; for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; e.counts[j]++; if (!std::isfinite(e.values[j])) { - fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str()); + LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str()); exit(1); } } @@ -238,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const { } if (n_zeros != 0 && is_first) { - fprintf(stderr, "\n"); + LOG_INF("\n"); is_first = false; } if (n_zeros == n_all) { - fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); + LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); continue; } if (n_zeros > 0) { - fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); continue; } @@ -257,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { } if (to_store.size() < m_stats.size()) { - fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); + LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); } std::ofstream out(fname, std::ios::binary); @@ -289,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const { out.write(m_params.prompt_file.c_str(), len); } - if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); - } + LOGV(1, "\n"); + LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } bool IMatrixCollector::load_imatrix(const char * fname) { std::ifstream in(fname, std::ios::binary); if (!in) { - printf("%s: failed to open %s\n",__func__, fname); + LOG_ERR("%s: 
failed to open %s\n",__func__, fname); return false; } int n_entries; in.read((char*)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { - printf("%s: no data in file %s\n", __func__, fname); + LOG_ERR("%s: no data in file %s\n", __func__, fname); return false; } for (int i = 0; i < n_entries; ++i) { @@ -311,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); + LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); return false; } name_as_vec[len] = 0; @@ -322,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - printf("%s: failed reading number of values for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i); m_stats = {}; return false; } @@ -335,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector tmp(nval); in.read((char*)tmp.data(), nval*sizeof(float)); if (in.fail()) { - printf("%s: failed reading data for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading data for entry %d\n",__func__,i); m_stats = {}; return false; } @@ -436,26 +434,25 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { const int n_ctx = llama_n_ctx(ctx); auto tim1 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + LOG_INF("%s: tokenizing the input ..\n", __func__); std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); if (params.i_chunk > 0) { if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { - fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); + LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); return false; } - fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); + LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); } if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, - n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx); + LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size()); return false; } @@ -477,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { double nll = 0.0; double nll2 = 0.0; - fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); + LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); std::vector workers(std::thread::hardware_concurrency() - 1); @@ -513,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & 
params) { // TODO: use batch.logits to save computations instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -530,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + LOG("%.2f minutes\n", total_seconds / 60.0); } if (params.compute_ppl) { const int first = n_ctx/2; - const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); + const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); count += n_ctx - first - 1; - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); fflush(stdout); logits.clear(); } } - printf("\n"); + LOG("\n"); if (params.compute_ppl) { nll2 /= count; @@ -561,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { nll2 -= nll * nll; if (nll2 > 0) { nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); } else { - printf("Unexpected negative standard deviation of log(prob)\n"); + LOG("Unexpected negative standard deviation of log(prob)\n"); } } @@ -575,27 +572,27 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; - params.verbosity = 1; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { return 1; } + gpt_init(); + params.n_batch = std::min(params.n_batch, params.n_ctx); g_collector.set_params(params); for (const auto & in_file : params.in_files) { - printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); + LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); if (!g_collector.load_imatrix(in_file.c_str())) { - fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); + LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str()); return 1; } } if (params.in_files.size() > 1) { - printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); + LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); g_collector.save_imatrix(); } @@ -614,20 +611,20 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); + LOG_ERR("%s : failed to init\n", __func__); return 1; } const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { - 
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } if (!compute_imatrix(ctx, params)) { @@ -636,8 +633,8 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG("\n"); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 87abb761f..b77b876cc 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -1,6 +1,8 @@ +#include "arg.h" #include "common.h" - #include "console.h" +#include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -54,7 +56,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -63,7 +65,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -92,7 +94,7 @@ static void sigint_handler(int signo) { is_interacting = true; } else { console::cleanup(); - printf("\n"); + LOG("\n"); gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); @@ -105,63 +107,55 @@ int main(int argc, char ** argv) { gpt_params params; g_params = ¶ms; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { return 1; } - auto & sparams = params.sparams; + gpt_init(); -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("infill", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS + auto & sparams = params.sparams; console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("\n************\n"); + LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("\n************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } + if (!params.interactive_first && (params.input_prefix.empty() && 
params.input_suffix.empty())) { - printf("\n************\n"); - printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); - printf("************\n\n"); + LOG_ERR("\n************\n"); + LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - print_build_info(); - - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - - LOG("%s: llama backend init\n", __func__); + LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -174,34 +168,32 @@ int main(int argc, char ** argv) { g_smpl = &smpl; // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; ctx = llama_init.context; if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); + LOG_DBG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } const bool add_bos = llama_add_bos_token(model); GGML_ASSERT(!llama_add_eos_token(model)); - LOG("add_bos: %d\n", add_bos); std::vector embd_inp; std::vector embd_end; @@ -226,18 +218,19 @@ int main(int argc, char ** argv) { embd_inp.push_back(middle_token); } - LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); - LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_DBG("add_bos: %d\n", add_bos); + LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str()); + LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } @@ -246,9 +239,8 @@ int main(int argc, char ** argv) { params.n_keep = (int)embd_inp.size(); } - 
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); - + LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str()); + LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str()); // enable interactive mode if interactive start is specified if (params.interactive_first) { @@ -256,21 +248,21 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_INF("\n"); + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > 0) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_TEE("'\n"); + LOG("'\n"); } - LOG_TEE("\n"); + LOG_INF("\n"); } if (params.interactive) { @@ -287,25 +279,30 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG_TEE("%s: interactive mode on.\n", __func__); + LOG_INF("%s: interactive mode on.\n", __func__); if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); + LOG_INF("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); } if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - LOG_TEE("sampling: \n%s\n", sparams.print().c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - LOG_TEE("\n\n"); + smpl = gpt_sampler_init(model, sparams); - LOG_TEE("\n##### Infill mode #####\n\n"); + LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); + + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + LOG("\n"); + LOG("\n##### Infill mode #####\n\n"); if (params.interactive) { const char *control_message; if (params.multiline_input) { @@ -316,11 +313,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_TEE("== Running in interactive mode. ==\n"); + LOG("== Running in interactive mode. 
==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); + LOG( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_TEE( "%s\n", control_message); + LOG( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -340,8 +337,6 @@ int main(int argc, char ** argv) { std::vector embd; - smpl = gpt_sampler_init(model, sparams); - while (n_remain != 0 || params.interactive) { // predict if (!embd.empty()) { @@ -355,9 +350,8 @@ int main(int argc, char ** argv) { embd.resize(max_embd_size); console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); console::set_display(console::reset); - fflush(stdout); } // infinite text generation via context swapping @@ -366,14 +360,14 @@ int main(int argc, char ** argv) { // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() > n_ctx) { if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep - 1; const int n_discard = n_left/2; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); @@ -381,9 +375,9 @@ int main(int argc, char ** argv) { n_past -= n_discard; - LOG("after swap: n_past = %d\n", n_past); + LOG_DBG("after swap: n_past = %d\n", n_past); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); } @@ -395,16 +389,16 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } n_past += n_eval; - LOG("n_past = %d\n", n_past); + LOG_DBG("n_past = %d\n", n_past); } } @@ -416,7 +410,7 @@ int main(int argc, char ** argv) { gpt_sampler_accept(smpl, id, true); - // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -426,10 +420,10 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); @@ -448,7 +442,7 @@ int main(int argc, char ** argv) { if (input_echo) { for (auto id : embd) { const std::string token_str = llama_token_to_piece(ctx, id); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); if (embd.size() > 1) { 
input_tokens.push_back(id); @@ -457,7 +451,6 @@ int main(int argc, char ** argv) { output_ss << token_str; } } - fflush(stdout); } // reset color to default if we there is no pending user input if (input_echo && (int) embd_inp.size() == n_consumed) { @@ -470,10 +463,9 @@ int main(int argc, char ** argv) { if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } - fflush(stdout); - printf("\n"); + LOG("\n"); console::set_display(console::user_input); std::string buffer; std::string line; @@ -529,35 +521,33 @@ int main(int argc, char ** argv) { n_remain = params.n_predict; n_past = 0; n_consumed = 0; - // LOG_TEE("took new input\n"); is_interacting = false; } // deal with end of generation tokens in interactive mode else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { - LOG("found EOS token\n"); + LOG_DBG("found EOS token\n"); if (params.interactive) { is_interacting = true; - printf("\n"); + LOG("\n"); console::set_display(console::user_input); - fflush(stdout); } } if (n_past > 0 && is_interacting && !params.interactive) { - LOG("waiting for user input\n"); + LOG_DBG("waiting for user input\n"); if (params.input_prefix_bos) { - LOG("adding input prefix BOS token\n"); + LOG_DBG("adding input prefix BOS token\n"); embd_inp.push_back(llama_token_bos(model)); } std::string buffer; if (!params.input_prefix.empty()) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); buffer += params.input_prefix; - printf("%s", buffer.c_str()); + LOG("%s", buffer.c_str()); } std::string line; @@ -575,17 +565,17 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty()) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); buffer += params.input_suffix; - printf("%s", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); } - LOG("buffer: '%s'\n", buffer.c_str()); + LOG_DBG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); const auto line_inp = ::llama_tokenize(ctx, buffer, false); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); @@ -596,9 +586,9 @@ int main(int argc, char ** argv) { } n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { - LOG("empty line, passing control back\n"); + LOG_DBG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -625,11 +615,10 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } - LOG_TEE("\n"); + LOG("\n"); gpt_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); @@ -639,9 +628,5 @@ int main(int argc, char ** argv) { gpt_sampler_free(smpl); llama_backend_free(); -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); 
-#endif // LOG_DISABLE_LOGS - return 0; } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d7db5af72..2d90f65a0 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 06ec160c2..f611809c6 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -414,8 +414,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( // sample the most likely token const auto new_token_id = llama_sampler_sample(sampler, context, -1); - llama_sampler_accept(sampler, new_token_id); - const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { return nullptr; diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 92f61fe83..dcd9803a2 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -152,8 +152,6 @@ actor LlamaContext { new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1) - llama_sampler_accept(sampling, new_token_id) - if llama_token_is_eog(model, new_token_id) || n_cur == n_len { print("\n") is_done = true diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 06a65fba4..4f783f3ce 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert_image_encoder_to_gguf \ +python ./examples/llava/convert_image_encoder_to_gguf.py \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B/llava.projector \ --output-dir path/to/MobileVLM-1.7B \ @@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \ ``` ```sh -python ./examples/llava/convert_image_encoder_to_gguf \ +python ./examples/llava/convert_image_encoder_to_gguf.py \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ --output-dir path/to/MobileVLM-1.7B_V2 \ @@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \ 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B +python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown ``` -5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` +5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k` ```sh -./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s +./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s ``` Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory. 
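Note: the llama-android.cpp and LibLlama.swift hunks above drop the explicit `llama_sampler_accept()` call right after `llama_sampler_sample()`, which suggests the sampler now records the sampled token into its own history internally. Below is a minimal C++ sketch of the resulting generation loop under that assumption; the helper name `generate_loop` and its parameters are illustrative only, while the other calls (`llama_sampler_sample`, `llama_token_is_eog`, `llama_batch_get_one`, `llama_decode`) follow the usage seen elsewhere in this patch.

```cpp
#include "llama.h"

// Illustrative only: a plain generation loop after this change, assuming
// llama_sampler_sample() also accepts the token into the sampler's history,
// so no separate llama_sampler_accept() call is needed.
static void generate_loop(llama_context * ctx, const llama_model * model,
                          llama_sampler * smpl, int n_cur, int n_len) {
    while (n_cur < n_len) {
        // sample the most likely token from the last set of logits
        llama_token id = llama_sampler_sample(smpl, ctx, -1);

        if (llama_token_is_eog(model, id)) {
            break; // end-of-generation token
        }

        // feed the sampled token back in for the next step
        if (llama_decode(ctx, llama_batch_get_one(&id, 1, n_cur, 0))) {
            break; // decode failed
        }

        n_cur += 1;
    }
}
```

If callers kept a manual `llama_sampler_accept()` after sampling, the token would presumably be accepted twice, which is likely why both bindings remove the call outright rather than leaving it as optional.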
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9b890571e..8aa7b0750 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -3,7 +3,6 @@ // I'll gradually clean and extend it // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" -#include "log.h" #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" @@ -40,6 +39,11 @@ #include #include +#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0) + //#define CLIP_DEBUG_FUNCTIONS // RGB uint8 image @@ -165,7 +169,7 @@ static std::map PROJECTOR_TYPE_NAMES = { static int get_key_idx(const gguf_context * ctx, const char * key) { int i = gguf_find_key(ctx, key); if (i == -1) { - LOG_TEE("key %s not found in file\n", key); + LOG_ERR("key %s not found in file\n", key); throw std::runtime_error(format("Missing required key: %s", key)); } @@ -270,7 +274,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { size_t tensor_size = ggml_nbytes(tensor); - LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", + LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", prefix, ggml_n_dims(tensor), tensor->name, tensor_size, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); } @@ -288,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name) static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -307,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -568,7 +572,7 @@ struct clip_ctx { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return nullptr; } @@ -582,7 +586,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (load_image_size == nullptr) { load_image_size = clip_image_size_init(); } - LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); + LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); image_size_width = load_image_size->width; image_size_height = load_image_size->height; if (is_inf) { @@ -1047,21 +1051,21 @@ struct clip_ctx * clip_model_load(const char 
* fname, const int verbosity = 1) { const int idx_name = gguf_find_key(ctx, KEY_NAME); if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug const std::string name = gguf_get_val_str(ctx, idx_name); - LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); + LOG_INF("%s: model name: %s\n", __func__, name.c_str()); } - LOG_TEE("%s: description: %s\n", __func__, description.c_str()); - LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); - LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); - LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); - LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); - LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); - LOG_TEE("\n"); + LOG_INF("%s: description: %s\n", __func__, description.c_str()); + LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_INF("%s: n_kv: %d\n", __func__, n_kv); + LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str()); + LOG_INF("\n"); } const int n_tensors = gguf_get_n_tensors(ctx); // kv const int n_kv = gguf_get_n_kv(ctx); - LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", __func__, n_kv, n_tensors, fname); { std::map n_type; @@ -1072,7 +1076,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { n_type[type]++; } - LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(ctx, i); const enum gguf_type type = gguf_get_kv_type(ctx, i); @@ -1088,7 +1092,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } replace_all(value, "\n", "\\n"); - LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts @@ -1097,7 +1101,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { continue; } - LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } } @@ -1112,7 +1116,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { size_t tensor_size = ggml_nbytes(cur); model_size += tensor_size; if (verbosity >= 3) { - LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); } } @@ -1139,27 +1143,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { #ifdef GGML_USE_CUDA new_clip->backend = ggml_backend_cuda_init(0); - LOG_TEE("%s: CLIP using CUDA backend\n", __func__); + LOG_INF("%s: CLIP using CUDA backend\n", __func__); #endif #ifdef GGML_USE_METAL new_clip->backend = 
ggml_backend_metal_init(); - LOG_TEE("%s: CLIP using Metal backend\n", __func__); + LOG_INF("%s: CLIP using Metal backend\n", __func__); #endif #ifdef GGML_USE_CANN new_clip->backend = ggml_backend_cann_init(0); - LOG_TEE("%s: CLIP using CANN backend\n", __func__); + LOG_INF("%s: CLIP using CANN backend\n", __func__); #endif #ifdef GGML_USE_VULKAN new_clip->backend = ggml_backend_vk_init(0); - LOG_TEE("%s: CLIP using Vulkan backend\n", __func__); + LOG_INF("%s: CLIP using Vulkan backend\n", __func__); #endif if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); - LOG_TEE("%s: CLIP using CPU backend\n", __func__); + LOG_INF("%s: CLIP using CPU backend\n", __func__); } // model size and capabilities @@ -1194,16 +1198,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->use_gelu = gguf_get_val_bool(ctx, idx); if (verbosity >= 1) { - LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); - LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); - LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); - LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); - LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); - LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); + LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } } - LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); + LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); // load tensors { @@ -1216,7 +1220,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->ctx_data = ggml_init(params); if (!new_clip->ctx_data) { - LOG_TEE("%s: ggml_init() failed\n", __func__); + LOG_ERR("%s: ggml_init() failed\n", __func__); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1224,7 +1228,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { - LOG_TEE("cannot open model file for loading tensors\n"); + LOG_ERR("cannot open model file for loading tensors\n"); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1246,7 +1250,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); fin.seekg(offset, std::ios::beg); if (!fin) { - LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); + LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1317,23 +1321,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (verbosity >= 2) { - LOG_TEE("\n%s: vision model hparams\n", __func__); - LOG_TEE("image_size %d\n", hparams.image_size); - LOG_TEE("patch_size %d\n", hparams.patch_size); - LOG_TEE("v_hidden_size %d\n", 
hparams.hidden_size); - LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); - LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); - LOG_TEE("v_n_head %d\n", hparams.n_head); - LOG_TEE("v_n_layer %d\n", hparams.n_layer); - LOG_TEE("v_eps %f\n", hparams.eps); - LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); - LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); - LOG_TEE("v_image_grid_pinpoints: "); + LOG_INF("\n%s: vision model hparams\n", __func__); + LOG_INF("image_size %d\n", hparams.image_size); + LOG_INF("patch_size %d\n", hparams.patch_size); + LOG_INF("v_hidden_size %d\n", hparams.hidden_size); + LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate); + LOG_INF("v_projection_dim %d\n", hparams.projection_dim); + LOG_INF("v_n_head %d\n", hparams.n_head); + LOG_INF("v_n_layer %d\n", hparams.n_layer); + LOG_INF("v_eps %f\n", hparams.eps); + LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + LOG_INF("v_image_grid_pinpoints: "); for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { - LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); + LOG_INF("%d ", hparams.image_grid_pinpoints[i]); } - LOG_TEE("\n"); - LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + LOG_INF("\n"); + LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); } @@ -1371,7 +1375,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); } catch(const std::exception& /*e*/) { - LOG_TEE("%s: failed to load vision model tensors\n", __func__); + LOG_ERR("%s: failed to load vision model tensors\n", __func__); } // LLaVA projection @@ -1400,7 +1404,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch (std::runtime_error & /*e*/) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); - // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); + // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__); } catch (std::runtime_error & /*e*/) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1501,7 +1505,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); - LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); } return new_clip; @@ -1552,7 +1556,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { int nx, ny, nc; auto * data = stbi_load(fname, &nx, &ny, &nc, 3); if (!data) { - LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); + LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1564,7 +1568,7 @@ bool clip_image_load_from_bytes(const unsigned 
char * bytes, size_t bytes_length int nx, ny, nc; auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); if (!data) { - LOG_TEE("%s: failed to decode image bytes\n", __func__); + LOG_ERR("%s: failed to decode image bytes\n", __func__); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1754,7 +1758,7 @@ static std::pair select_best_resolution(const std::pair & or int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -1872,7 +1876,7 @@ static std::vector> uhd_slice_image(const clip_imag const int multiple = fmin(ceil(ratio), max_slice_nums); std::vector> images; - LOG_TEE("%s: multiple %d\n", __func__, multiple); + LOG_INF("%s: multiple %d\n", __func__, multiple); images.push_back(std::vector()); if (multiple <= 1) { @@ -1887,17 +1891,17 @@ static std::vector> uhd_slice_image(const clip_imag clip_image_u8 * source_image = clip_image_u8_init(); bicubic_resize(*img, *source_image, best_size.first, best_size.second); // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) - LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); + LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); images[images.size()-1].push_back(source_image); std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); - LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); + LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); clip_image_u8 * refine_image = clip_image_u8_init(); bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); - LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); + LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); // split_to_patches int width = refine_image->nx; @@ -1954,7 +1958,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli int idx = 0; for (size_t i = 0; i < imgs.size(); ++i) { for (size_t j = 0; j < imgs[i].size(); ++j) { - LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); + LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); clip_image_f32 * res = clip_image_f32_init(); normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); res_imgs->data[idx++] = *res; @@ -1966,7 
+1970,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli bool pad_to_square = true; if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } auto & params = ctx->vision_model.hparams; @@ -2043,7 +2047,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } for (size_t i = 0; i < patches.size(); i++) { - // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } @@ -2279,7 +2283,7 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } @@ -2291,7 +2295,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } @@ -2449,7 +2453,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); @@ -2521,7 +2525,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i new_type = type; if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type - // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); + // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); } const size_t n_elms = ggml_nelements(cur); float * f32_data; @@ -2540,7 +2544,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i f32_data = (float *)conv_buf.data(); break; default: - LOG_TEE("Please use an input file in f32 or f16\n"); + LOG_ERR("Please use an input file in f32 or f16\n"); gguf_free(ctx_out); return false; } @@ -2567,7 +2571,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i fout.put(0); } - LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); } @@ -2583,8 +2587,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i gguf_free(ctx_out); { - LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); - LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); } 
return true; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 5845d0106..8f437863f 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -1,14 +1,16 @@ -#include "ggml.h" +#include "arg.h" +#include "base64.hpp" #include "log.h" #include "common.h" +#include "sampling.h" #include "clip.h" #include "llava.h" #include "llama.h" - -#include "base64.hpp" +#include "ggml.h" #include #include +#include #include static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { @@ -19,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); + LOG_ERR("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); return NULL; } @@ -88,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); if (!embed) { - LOG_TEE("%s: could not load image from base64 string.\n", __func__); + LOG_ERR("%s: could not load image from base64 string.\n", __func__); return NULL; } @@ -113,9 +115,9 @@ struct llava_context { }; static void print_usage(int, char ** argv) { - LOG_TEE("\n example usage:\n"); - LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); + LOG("\n example usage:\n"); + LOG("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { @@ -125,11 +127,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para auto prompt = params->prompt; if (prompt_contains_image(prompt)) { if (!params->image.empty()) { - LOG_TEE("using base64 encoded image instead of command line image path\n"); + LOG_INF("using base64 encoded image instead of command line image path\n"); } embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); if (!embed) { - LOG_TEE("%s: can't load image from prompt\n", __func__); + LOG_ERR("%s: can't load image from prompt\n", __func__); return NULL; } params->prompt = remove_image_from_prompt(prompt); @@ -155,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ // new templating mode: Provide the full prompt including system message and use as a placeholder for the image system_prompt = prompt.substr(0, image_pos); user_prompt = prompt.substr(image_pos + std::string("").length()); - LOG_TEE("system_prompt: %s\n", system_prompt.c_str()); + LOG_INF("system_prompt: %s\n", system_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } - LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); + LOG_INF("user_prompt: %s\n", user_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, 
user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } else { @@ -176,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } @@ -187,11 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ // generate the response - LOG_TEE("\n"); + LOG("\n"); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); if (!smpl) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); exit(1); } @@ -201,7 +203,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + LOG("%s", tmp); if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 @@ -210,7 +212,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ } gpt_sampler_free(smpl); - printf("\n"); + LOG("\n"); } static struct llama_model * llava_init(gpt_params * params) { @@ -221,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) { llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return NULL; } return model; @@ -244,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return NULL; } - auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); ctx_llava->ctx_llama = ctx_llama; ctx_llava->ctx_clip = ctx_clip; @@ -267,65 +269,54 @@ static void llava_free(struct llava_context * ctx_llava) { llama_backend_free(); } -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - int main(int argc, char ** argv) { ggml_time_init(); gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { return 1; } -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("llava", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); 
-#endif // LOG_DISABLE_LOGS + gpt_init(); if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); return 1; } - auto model = llava_init(¶ms); + + auto * model = llava_init(¶ms); if (model == NULL) { fprintf(stderr, "%s: error: failed to init llava model\n", __func__); return 1; } if (prompt_contains_image(params.prompt)) { - auto ctx_llava = llava_init_context(¶ms, model); + auto * ctx_llava = llava_init_context(¶ms, model); - auto image_embed = load_image(ctx_llava, ¶ms, ""); + auto * image_embed = load_image(ctx_llava, ¶ms, ""); // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); } else { for (auto & image : params.image) { - auto ctx_llava = llava_init_context(¶ms, model); + auto * ctx_llava = llava_init_context(¶ms, model); - auto image_embed = load_image(ctx_llava, ¶ms, image); + auto * image_embed = load_image(ctx_llava, ¶ms, image); if (!image_embed) { - std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; + LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); return 1; } // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 851af0f00..8558c6bdc 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -1,13 +1,23 @@ #include "clip.h" -#include "common.h" -#include "llama.h" #include "llava.h" -#include "base64.hpp" +#include "llama.h" + +#include +#include #include #include +#include +#include #include -#include + +#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) +#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) + +#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_DBG(...) 
do { fprintf(stdout, __VA_ARGS__); } while (0) // RGB uint8 image struct clip_image_u8 { @@ -54,7 +64,7 @@ static std::pair select_best_resolution(const std::pair& ori int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -184,7 +194,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); ggml_build_forward_expand(gf, flatten); ggml_graph_compute_with_ctx(model.ctx, gf, 1); - struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor* result = ggml_graph_node(gf, -1); memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): @@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli img_res_v.size = 0; img_res_v.data = nullptr; if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { - LOG_TEE("%s: unable to preprocess image\n", __func__); + LOG_ERR("%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; } @@ -265,14 +275,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); } if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); + LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); int n_img_pos_out = 0; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -287,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli load_image_size->width = img->nx; load_image_size->height = img->ny; clip_add_load_image_size(ctx_clip, load_image_size); - LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); + LOG_INF("%s: load_image_size %d %d\n", __func__, 
load_image_size->width, load_image_size->height); } else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding @@ -295,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 delete[] img_res_v.data; if (!encoded) { - LOG_TEE("Unable to encode image\n"); + LOG_ERR("Unable to encode image\n"); return false; } @@ -309,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); const int32_t * image_grid = clip_image_grid(ctx_clip); @@ -347,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); } - LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; - LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); + LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); return true; } @@ -362,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); auto n_image_embd = clip_n_mmproj_embd(ctx_clip); if (n_image_embd != n_llama_embd) { - LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); + LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). 
Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); return false; } return true; @@ -375,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co } float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model if (!image_embd) { - LOG_TEE("Unable to allocate memory for image embeddings\n"); + LOG_ERR("Unable to allocate memory for image embeddings\n"); return false; } int n_img_pos; if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { - LOG_TEE("%s: cannot encode image, aborting\n", __func__); + LOG_ERR("%s: cannot encode image, aborting\n", __func__); free(image_embd); return false; } @@ -401,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ } llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; if (llama_decode(ctx_llama, batch)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } *n_past += n_eval; @@ -413,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); - LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); + LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__); return NULL; } @@ -422,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); if (!image_embed_result) { clip_image_u8_free(img); - LOG_TEE("%s: coulnd't embed the image\n", __func__); + LOG_ERR("%s: coulnd't embed the image\n", __func__); return NULL; } @@ -436,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { auto file = fopen(path, "rb"); if (file == NULL) { - LOG_TEE("%s: can't read file %s\n", __func__, path); + LOG_ERR("%s: can't read file %s\n", __func__, path); return false; } @@ -446,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data if (buffer == NULL) { - LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); + LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); perror("Memory allocation error"); fclose(file); return false; @@ -471,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); if (!loaded) { - LOG_TEE("%s: failed to load %s\n", __func__, image_path); + LOG_ERR("%s: failed to load %s\n", __func__, image_path); return NULL; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 57e7d42c5..c5156c35b 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -1,13 +1,18 @@ -#include "ggml.h" +#include "arg.h" #include "log.h" #include "common.h" +#include "sampling.h" #include "clip.h" #include "llava.h" #include "llama.h" +#include "ggml.h" +#include #include #include +#include 
#include +#include // TODO: remove me struct llava_context { struct clip_ctx * ctx_clip = NULL; @@ -16,14 +21,8 @@ struct llava_context { }; static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); + LOG("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llama_model * llava_init(gpt_params * params) { @@ -34,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) { llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return NULL; } return model; @@ -49,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); if (params->n_ctx < 2048) { // warn user here, "Image processing requires at least 2048 context, setting context to 2048" - LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); + LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); ctx_params.n_ctx = 2048; } else { ctx_params.n_ctx = params->n_ctx; @@ -58,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return NULL; } - auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); ctx_llava->ctx_llama = ctx_llama; ctx_llava->model = model; @@ -87,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) { if (prompt.empty()) { prompt = "describe the image in detail."; } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); return ctx_clip; } @@ -99,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vectorctx_clip)); std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); - auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); slice_embed->embed = image_embed; slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); @@ -141,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e else if (has_minicpmv_projector == 3) { system_prompt = "<|im_start|>user\n"; } - LOG_TEE("%s: image token past: %d\n", __func__, n_past); + LOG_INF("%s: image token past: %d\n", __func__, n_past); 
eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); @@ -160,7 +159,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e } eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); } - LOG_TEE("%s: image token past: %d\n", __func__, n_past); + LOG_INF("%s: image token past: %d\n", __func__, n_past); } static const char * sample(struct gpt_sampler * smpl, @@ -179,42 +178,42 @@ static const char * sample(struct gpt_sampler * smpl, } static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ - auto ctx_clip = clip_init_context(params); - auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); + auto * ctx_clip = clip_init_context(params); + auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embeds) { - std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; + LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str()); return NULL; } // process the prompt if (params->prompt.empty() && params->interactive == false) { - LOG_TEE("prompt should be given or interactive mode should be on"); + LOG_ERR("prompt should be given or interactive mode should be on"); return NULL; } - auto model = llava_init(params); + auto * model = llava_init(params); if (model == NULL) { fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); return NULL; } const int64_t t_llava_init_start_us = ggml_time_us(); - auto ctx_llava = llava_init_context(params, model); + auto * ctx_llava = llava_init_context(params, model); ctx_llava->ctx_clip = ctx_clip; const int64_t t_llava_init_end_us = ggml_time_us(); float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; - LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); + LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); const int64_t t_process_image_start_us = ggml_time_us(); process_image(ctx_llava, embeds, params, n_past); const int64_t t_process_image_end_us = ggml_time_us(); float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; - LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); + LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); llava_image_embed_free(embeds); return ctx_llava; } -static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ +static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){ std::string user_prompt = prompt; int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); if (!is_first) { @@ -236,7 +235,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par // generate the response - LOG_TEE("\n"); + LOG_INF("\n"); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); return smpl; @@ -253,17 +252,11 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info); - if 
(!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { return 1; } -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("llava", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS + gpt_init(); if (params.mmproj.empty() || (params.image.empty())) { show_additional_info(argc, argv); @@ -272,21 +265,23 @@ int main(int argc, char ** argv) { for (auto & image : params.image) { int n_past = 0; - auto ctx_llava = minicpmv_init(&params, image, n_past); + auto * ctx_llava = minicpmv_init(&params, image, n_past); if (!params.prompt.empty()) { - LOG_TEE("%s\n", params.prompt.c_str()); - LOG_TEE(""); - auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true); + LOG("%s\n", params.prompt.c_str()); + LOG(""); + auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true); const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response = ""; + std::string response; bool have_tmp = false; for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, smpl, n_past); + const auto * tmp = llama_loop(ctx_llava, smpl, n_past); response += tmp; if (strcmp(tmp, "") == 0){ - if(!have_tmp)continue; - else break; + if (!have_tmp) { + continue; + } + break; } if (strstr(tmp, "###")) break; // Yi-VL behavior have_tmp = true; @@ -298,15 +293,15 @@ int main(int argc, char ** argv) { gpt_sampler_free(smpl); }else { while (true) { - LOG_TEE(""); + LOG(""); std::string prompt; std::getline(std::cin, prompt); - LOG_TEE(""); - auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true); + LOG(""); + auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true); const int max_tgt_len = params.n_predict < 0 ?
256 : params.n_predict; - std::string response = ""; + std::string response; for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, smpl, n_past); + const auto * tmp = llama_loop(ctx_llava, smpl, n_past); response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior @@ -318,7 +313,7 @@ int main(int argc, char ** argv) { } } printf("\n"); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 5027a483a..49870b4a4 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -1,4 +1,7 @@ +#include "arg.h" #include "common.h" +#include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -36,23 +39,18 @@ struct ngram_container { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } + gpt_init(); + const int W = 15; // lookahead window const int N = 5; // n-gram size const int G = 15; // max verification n-grams const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("lookahead", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -74,14 +72,14 @@ int main(int argc, char ** argv) { const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -165,7 +163,7 @@ int main(int argc, char ** argv) { { const std::string token_str = llama_token_to_piece(ctx, id); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); fflush(stdout); } } @@ -255,7 +253,7 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__); + LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__); return 1; } @@ -292,10 +290,10 @@ int main(int argc, char ** argv) { const std::string token_str = llama_token_to_piece(ctx, id); if (v == 0) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } else { // print light cyan - printf("\033[0;96m%s\033[0m", token_str.c_str()); + LOG("\033[0;96m%s\033[0m", token_str.c_str()); } fflush(stdout); @@ -329,21 +327,21 @@ int main(int argc, char ** argv) { // print known n-grams starting with token id (debug) if (0 && v == 0) { if (ngrams_observed.cnt[id] > 0) { - printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); + LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); } for (int i = 0; i < ngrams_observed.cnt[id]; i++) { - printf(" - ngram %2d: ", i); + LOG(" - ngram %2d: ", i); const int idx 
= id*(N - 1)*G + i*(N - 1); for (int j = 0; j < N - 1; j++) { const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } - printf("\n"); + LOG("\n"); } } @@ -454,20 +452,20 @@ int main(int argc, char ** argv) { auto t_dec_end = ggml_time_us(); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("W = %2d\n", W); - LOG_TEE("N = %2d\n", N); - LOG_TEE("G = %2d\n", G); - LOG_TEE("\n"); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_accept = %d\n", n_accept); + LOG_INF("\n"); + LOG_INF("W = %2d\n", W); + LOG_INF("N = %2d\n", N); + LOG_INF("G = %2d\n", G); + LOG_INF("\n"); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_accept = %d\n", n_accept); - LOG_TEE("\n"); + LOG_INF("\n"); gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); @@ -481,7 +479,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 795b06c88..33287c02c 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -1,7 +1,8 @@ -#include "ggml.h" -#include "llama.h" +#include "arg.h" #include "common.h" #include "ngram-cache.h" +#include "ggml.h" +#include "llama.h" #include #include @@ -13,8 +14,7 @@ int main(int argc, char ** argv){ gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } @@ -40,4 +40,6 @@ int main(int argc, char ** argv){ fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); + + return 0; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 93299ef8b..6d1e1ceb9 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -1,25 +1,26 @@ -#include "ggml.h" +#include "arg.h" #include "common.h" -#include "llama.h" #include "log.h" #include "ngram-cache.h" +#include "llama.h" +#include "ggml.h" -#include #include #include +#include #include #include #include -#include int main(int argc, char ** argv){ gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } + gpt_init(); + const int n_draft = params.n_draft; // init llama.cpp @@ -49,7 +50,7 @@ int main(int argc, char ** argv){ try { ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - fprintf(stderr, "error: failed to open static lookup cache: %s", 
params.lookup_cache_static.c_str()); + LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } @@ -128,7 +129,7 @@ int main(int argc, char ** argv){ const int64_t eta_min = eta_ms / (60*1000); const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; - LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); + LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); } // After each chunk, update the dynamic ngram cache with the context ngram cache: @@ -136,24 +137,24 @@ int main(int argc, char ** argv){ ngram_cache_context.clear(); } - LOG_TEE("\n"); + LOG("\n"); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); llama_free(ctx); llama_free_model(model); llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 9ac7f6b47..2ccd0e6c1 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -1,7 +1,10 @@ +#include "arg.h" #include "ggml.h" -#include "llama.h" #include "common.h" #include "ngram-cache.h" +#include "sampling.h" +#include "log.h" +#include "llama.h" #include #include @@ -12,22 +15,17 @@ int main(int argc, char ** argv){ gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } + gpt_init(); + // max. 
number of additional tokens to draft if match is found const int n_draft = params.n_draft; const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("lookup", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -57,7 +55,7 @@ int main(int argc, char ** argv){ try { ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } @@ -75,14 +73,14 @@ int main(int argc, char ** argv){ const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -123,7 +121,7 @@ int main(int argc, char ** argv){ } // print current draft sequence - LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str()); + LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str()); int i_dft = 0; while (true) { @@ -135,7 +133,7 @@ int main(int argc, char ** argv){ const std::string token_str = llama_token_to_piece(ctx, id); if (!params.use_color) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } if (llama_token_is_eog(model, id)) { @@ -146,7 +144,7 @@ int main(int argc, char ** argv){ // check if the target token matches the draft if (i_dft < (int) draft.size() && id == draft[i_dft]) { - LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); + LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); ++n_accept; ++n_past; ++i_dft; @@ -160,19 +158,19 @@ int main(int argc, char ** argv){ if (params.use_color) { // color accepted draft token - printf("\033[34m%s\033[0m", token_str.c_str()); + LOG("\033[34m%s\033[0m", token_str.c_str()); fflush(stdout); } continue; } if (params.use_color) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } fflush(stdout); - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); draft.clear(); draft.push_back(id); @@ -223,24 +221,23 @@ int main(int argc, char ** argv){ llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - 
t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ntarget:\n\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG_INF("\ntarget:\n\n"); + gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); @@ -251,7 +248,7 @@ int main(int argc, char ** argv){ llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/main/README.md b/examples/main/README.md index 9396a34fa..6730effdf 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -161,6 +161,8 @@ A value of -1 will enable infinite text generation, even though we have a finite If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled. +The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full. + It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. 
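As a usage illustration of the options described above (the binary name `llama-cli` and the model path are assumptions for this example, not taken from the patch), a run that generates until the finite context window is full and then stops, without shifting the context or stopping on EOS, could look like:
    ./llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n -1 --no-context-shift --ignore-eos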
### Temperature diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ef2158842..91fea9326 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,11 +1,11 @@ +#include "arg.h" #include "common.h" - #include "console.h" +#include "log.h" +#include "sampling.h" #include "llama.h" #include -#include -#include #include #include #include @@ -41,11 +41,13 @@ static std::vector<llama_token> * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; -static void print_usage(int, char ** argv) { - printf("\nexample usage:\n"); - printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); - printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); - printf("\n"); +static void print_usage(int argc, char ** argv) { + (void) argc; + + LOG("\nexample usage:\n"); + LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + LOG("\n"); } static bool file_exists(const std::string & path) { @@ -73,8 +75,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); + LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -82,7 +83,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -112,7 +113,7 @@ static void sigint_handler(int signo) { need_insert_eot = true; } else { console::cleanup(); - printf("\n"); + LOG("\n"); gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); @@ -121,80 +122,61 @@ } #endif -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - -static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) { +static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) { llama_chat_msg new_msg{role, content}; auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); - LOG("formatted: %s\n", formatted.c_str()); + LOG_DBG("formatted: '%s'\n", formatted.c_str()); return formatted; } int main(int argc, char ** argv) { gpt_params params; g_params = &params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage); - - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { return 1; } + gpt_init(); + auto & sparams = params.sparams; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("main", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS - - // TODO: Dump
params ? - //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); - // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - print_build_info(); + LOG_INF("%s: llama backend init\n", __func__); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -209,21 +191,19 @@ int main(int argc, char ** argv) { g_smpl = &smpl; // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; ctx = llama_init.context; if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); + LOG_ERR("%s: error: unable to load model\n", __func__); return 1; } - LOG("%s: llama threadpool init = n_threads = %d\n", - __func__, - (int) params.cpuparams.n_threads - ); + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -235,8 +215,8 @@ int main(int argc, char ** argv) { if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { threadpool_batch = ggml_threadpool_new(&tpp_batch); if (!threadpool_batch) { - LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); - exit(1); + LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + return 1; } // Start the non-batch threadpool in the paused state @@ -245,55 +225,54 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { - LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); + LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + return 1; } 
llama_attach_threadpool(ctx, threadpool, threadpool_batch); const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print chat template example in conversation mode if (params.conversation) { if (params.enable_chat_template) { - LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); } else { - LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } } // print system information { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); } std::string path_session = params.path_prompt_cache; std::vector<llama_token> session_tokens; if (!path_session.empty()) { - LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); if (!file_exists(path_session)) { - LOG_TEE("%s: session file does not exist, will create.\n", __func__); + LOG_INF("%s: session file does not exist, will create.\n", __func__); } else if (file_is_empty(path_session)) { - LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); + LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__); } else { // The file exists and is not empty session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); return 1; } session_tokens.resize(n_token_count_out); - LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } @@ -301,7 +280,7 @@ int main(int argc, char ** argv) { if (!llama_model_has_encoder(model)) { GGML_ASSERT(!llama_add_eos_token(model)); } - LOG("add_bos: %d\n", add_bos); + + LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); std::vector<llama_token> embd_inp; @@ -310,31 +290,31 @@ int main(int argc, char ** argv) { ?
chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG("tokenize the prompt\n"); + LOG_DBG("tokenize the prompt\n"); embd_inp = ::llama_tokenize(ctx, prompt, true, true); } else { - LOG("use session tokens\n"); + LOG_DBG("use session tokens\n"); embd_inp = session_tokens; } - LOG("prompt: \"%s\"\n", log_tostr(prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); } // Should not run without any tokens if (embd_inp.empty()) { if (add_bos) { embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } else { - LOG_TEE("error: input is empty\n"); + LOG_ERR("input is empty\n"); return -1; } } // Tokenize negative prompt if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } @@ -348,29 +328,28 @@ int main(int argc, char ** argv) { n_matching_session_tokens++; } if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_TEE("%s: using full prompt from session file\n", __func__); + LOG_INF("%s: using full prompt from session file\n", __func__); } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_TEE("%s: session file has exact match for prompt!\n", __func__); + LOG_INF("%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); } else { - LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); } // remove any "future" tokens that we might have inherited from the previous session llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } - LOGLN( - "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu", - log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); + LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", + embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); // if we will use the cache for the full prompt without reaching the end of the cache, force // reevaluation of the last token to recalculate the cached logits if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", 
embd_inp.size() - 1); + LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1); } @@ -392,21 +371,20 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > add_bos) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_TEE("'\n"); + LOG("'\n"); } - LOG_TEE("\n"); + LOG_INF("\n"); } // ctrl+C handling @@ -426,40 +404,40 @@ int main(int argc, char ** argv) { } if (params.interactive) { - LOG_TEE("%s: interactive mode on.\n", __func__); + LOG("%s: interactive mode on.\n", __func__); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { - LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + LOG("Reverse prompt: '%s'\n", antiprompt.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } } if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); + LOG("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("Input prefix: '%s'\n", params.input_prefix.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("Input suffix: '%s'\n", params.input_suffix.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -467,13 +445,15 @@ int main(int argc, char ** argv) { smpl = gpt_sampler_init(model, sparams); if (!smpl) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); - exit(1); + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + return 1; } - LOG_TEE("sampling params: \n%s\n", sparams.print().c_str()); - LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler params: 
\n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); + + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) @@ -487,9 +467,9 @@ int main(int argc, char ** argv) { GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT - LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); } - LOG_TEE("\n\n"); + LOG("\n"); if (params.interactive) { const char * control_message; @@ -501,11 +481,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_TEE("== Running in interactive mode. ==\n"); + LOG("== Running in interactive mode. ==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); + LOG( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_TEE( "%s\n", control_message); + LOG( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -544,7 +524,7 @@ int main(int argc, char ** argv) { llama_token * enc_input_buf = embd_inp.data(); if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -570,9 +550,8 @@ int main(int argc, char ** argv) { embd.resize(max_embd_size); console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); console::set_display(console::reset); - fflush(stdout); } if (ga_n == 1) { @@ -580,29 +559,35 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() >= n_ctx) { - if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + if (!params.ctx_shift){ + LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); break; + } else { + if (params.n_predict == -2) { + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); } - - const int n_left = n_past - params.n_keep; - const int n_discard = n_left/2; - - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG("after swap: n_past = %d\n", n_past); - - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - - LOG("clear session path\n"); - path_session.clear(); } } else { // context extension via Self-Extend @@ -611,10 +596,10 @@ int main(int argc, char ** argv) { const int bd = (ga_w/ga_n)*(ga_n - 1); const int dd = (ga_w/ga_n) - ib*bd - ga_w; - LOG("\n"); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); - LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + LOG_DBG("\n"); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); @@ -624,7 +609,7 @@ int main(int argc, char ** argv) { ga_i += ga_w/ga_n; - LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); } } @@ -656,19 +641,19 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); if (llama_decode(ctx, 
llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } n_past += n_eval; - LOG("n_past = %d\n", n_past); + LOG_DBG("n_past = %d\n", n_past); // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { - LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } } @@ -686,14 +671,14 @@ int main(int argc, char ** argv) { need_to_save_session = false; llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - LOG("saved session to %s\n", path_session.c_str()); + LOG_DBG("saved session to %s\n", path_session.c_str()); } const llama_token id = gpt_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, /* apply_grammar= */ true); + gpt_sampler_accept(smpl, id, /* accept_grammar= */ true); - // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -703,16 +688,16 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false); + gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -727,7 +712,7 @@ int main(int argc, char ** argv) { const std::string token_str = llama_token_to_piece(ctx, id, params.special); // Console/Stream Output - fprintf(stdout, "%s", token_str.c_str()); + LOG("%s", token_str.c_str()); // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check @@ -739,8 +724,6 @@ int main(int argc, char ** argv) { output_tokens.push_back(id); output_ss << token_str; } - - fflush(stdout); } } @@ -789,13 +772,13 @@ int main(int argc, char ** argv) { } if (is_antiprompt) { - LOG("found antiprompt: %s\n", last_output.c_str()); + LOG_DBG("found antiprompt: %s\n", last_output.c_str()); } } // deal with end of generation tokens in interactive mode if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { - LOG("found an EOG token\n"); + LOG_DBG("found an EOG token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { @@ -809,7 +792,7 @@ int main(int argc, char ** argv) { chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); } is_interacting = true; - printf("\n"); + LOG("\n"); } } @@ -820,21 +803,21 @@ int main(int argc, char ** argv) { } if (n_past > 0 && is_interacting) { - LOG("waiting for user input\n"); + LOG_DBG("waiting for user input\n"); if (params.conversation) { - printf("\n> "); + LOG("\n> "); } if (params.input_prefix_bos) { - LOG("adding input prefix BOS token\n"); + LOG_DBG("adding input prefix BOS token\n"); embd_inp.push_back(llama_token_bos(model)); } std::string buffer; if 
(!params.input_prefix.empty() && !params.conversation) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - printf("%s", params.input_prefix.c_str()); + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("%s", params.input_prefix.c_str()); } // color user input only @@ -857,11 +840,11 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty() && !params.conversation) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - printf("%s", params.input_suffix.c_str()); + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); } - LOG("buffer: '%s'\n", buffer.c_str()); + LOG_DBG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); @@ -878,7 +861,7 @@ int main(int argc, char ** argv) { const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { @@ -901,9 +884,9 @@ int main(int argc, char ** argv) { assistant_ss.str(""); n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { - LOG("empty line, passing control back\n"); + LOG_DBG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -919,7 +902,7 @@ int main(int argc, char ** argv) { // end of generation if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { - LOG_TEE(" [end of text]\n"); + LOG(" [end of text]\n"); break; } @@ -932,11 +915,11 @@ int main(int argc, char ** argv) { } if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } - LOG_TEE("\n"); + LOG("\n\n"); gpt_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); @@ -950,9 +933,5 @@ int main(int argc, char ** argv) { ggml_threadpool_free(threadpool); ggml_threadpool_free(threadpool_batch); -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS - return 0; } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7f512d8ad..81e2f7ed7 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -1,7 +1,10 @@ // A basic application simulating a server with multiple clients. // The clients submit requests to the server and they are processed in parallel. +#include "arg.h" #include "common.h" +#include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -81,7 +84,9 @@ static void print_date_time() { char buffer[80]; strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time); - printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer); + LOG_INF("\n"); + LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer); + LOG_INF("\n"); } // Define a split string function to ... 
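A minimal sketch of the logging pattern that the hunks above and below converge on, assuming only the `arg.h`/`common.h`/`log.h` interfaces already used in this diff (`gpt_params_parse`, `gpt_init`, and the printf-style `LOG_*` macros); the `main()` body is illustrative, not code from this change:

#include "arg.h"
#include "common.h"
#include "log.h"

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    gpt_init(); // one-time logger setup, replacing the old log_set_target/log_dump_cmdline calls

    LOG_INF("%s: informational progress message\n", __func__);
    LOG_WRN("%s: recoverable problem\n", __func__);
    LOG_ERR("%s: hard failure\n", __func__);
    LOG_DBG("n_batch = %d\n", params.n_batch); // only visible at higher verbosity
    LOG("raw output without a prefix\n");      // replaces printf/fprintf for user-facing text

    return 0;
}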
@@ -100,11 +105,12 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; } + gpt_init(); + // number of simultaneous "clients" to simulate const int32_t n_clients = params.n_parallel; @@ -119,12 +125,6 @@ int main(int argc, char ** argv) { const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("parallel", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -137,23 +137,22 @@ int main(int argc, char ** argv) { // load the prompts from an external file if there are any if (params.prompt.empty()) { - printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); + LOG_INF("\033[32mNo new questions so proceed with built-in defaults.\033[0m\n"); } else { // Output each line of the input params.prompts vector and copy to k_prompts int index = 0; - printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); + LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); std::vector<std::string> prompts = split_string(params.prompt, '\n'); for (const auto& prompt : prompts) { k_prompts.resize(index + 1); k_prompts[index] = prompt; index++; - printf("%3d prompt: %s\n", index, prompt.c_str()); + LOG_INF("%3d prompt: %s\n", index, prompt.c_str()); } } - fprintf(stderr, "\n\n"); - fflush(stderr); + LOG_INF("\n\n"); const int n_ctx = llama_n_ctx(ctx); @@ -182,19 +181,19 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); - LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); - LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); - LOG_TEE("\n"); + LOG_INF("%s: Simulating parallel requests from clients:\n", __func__); + LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + LOG_INF("\n"); { - LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); + LOG_INF("%s: Evaluating the system prompt ...\n", __func__); for (int32_t i = 0; i < n_tokens_system; ++i) { llama_batch_add(batch, tokens_system[i], i, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -203,10 +202,10 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } - LOG_TEE("\n"); + LOG_INF("\n"); } - LOG_TEE("Processing requests ...\n\n"); + LOG_INF("Processing requests ...\n\n"); while (true) { if (dump_kv_cache) { @@ -237,7 +236,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } - LOG_TEE("%s: clearing the KV cache\n", __func__); + LOG_INF("%s: clearing the KV cache\n", __func__); } // insert new sequences for decoding @@ -272,7 +271,7 @@ int main(int argc, char ** argv) { client.n_decoded = 0; client.i_batch = batch.n_tokens - 1; - LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); + LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); g_seq_id += 1; @@ -316,11 +315,11
@@ int main(int argc, char ** argv) { if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); return 1; } - LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); n_cache_miss += 1; @@ -331,7 +330,7 @@ int main(int argc, char ** argv) { continue; } - LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens); + LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens); for (auto & client : clients) { if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { @@ -376,7 +375,7 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n", + LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n", client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded, (t_main_end - client.t_start_prompt) / 1e6, (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6, @@ -399,22 +398,22 @@ int main(int argc, char ** argv) { print_date_time(); - LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); if (params.prompt_file.empty()) { params.prompt_file = "used built-in defaults"; } - LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); - LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); + LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); + LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); - LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Cache misses: %6d\n", n_cache_miss); + LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); + LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); + LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); + LOG_INF("Cache misses: %6d\n", n_cache_miss); - LOG_TEE("\n"); + LOG_INF("\n"); // TODO: print sampling/grammar timings for all clients - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_batch_free(batch); @@ -423,7 +422,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + 
LOG("\n\n"); return 0; } diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 76d235c2c..7ef8d14f3 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -1,4 +1,6 @@ +#include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -7,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -19,11 +21,12 @@ int main(int argc, char ** argv) { params.n_keep = 32; params.i_pos = -1; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { return 1; } + gpt_init(); + int n_junk = params.n_junk; int n_keep = params.n_keep; int n_grp = params.grp_attn_n; @@ -63,7 +66,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return 1; } @@ -77,7 +80,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return 1; } @@ -107,14 +110,14 @@ int main(int argc, char ** argv) { const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); + LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); // print the prompt token-by-token - LOG_TEE("\n"); - LOG_TEE("prefix tokens: %d\n", n_tokens_prefix); - LOG_TEE("prompt tokens: %d\n", n_tokens_all); - //LOG_TEE("prompt: %s\n", params.prompt.c_str()); + LOG_INF("\n"); + LOG_INF("prefix tokens: %d\n", n_tokens_prefix); + LOG_INF("prompt tokens: %d\n", n_tokens_all); + //LOG_INF("prompt: %s\n", params.prompt.c_str()); llama_batch batch = llama_batch_init(params.n_batch, 0, 1); @@ -145,11 +148,11 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_INF("%s: llama_decode() failed\n", __func__); return 1; } - LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); if (i + n_batch >= n_tokens_all) { break; @@ -159,7 +162,7 @@ int main(int argc, char ** argv) { for (int i = n_ctx; i < n_tokens_all; i += n_batch) { const int n_discard = n_batch; - LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); + LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -179,18 +182,18 @@ int main(int argc, char 
** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } - LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); } { const int n_discard = n_past - n_ctx + n_predict; if (n_discard > 0) { - LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); + LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -201,17 +204,16 @@ int main(int argc, char ** argv) { } } - LOG_TEE("\n"); - LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); - LOG_TEE("\n"); + LOG_INF("\n"); + LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); + LOG_INF("\n"); // main loop int n_cur = n_tokens_all; int n_decode = 0; - LOG_TEE("%s", prompt_suffix.c_str()); - fflush(stdout); + LOG_INF("%s", prompt_suffix.c_str()); const auto t_main_start = ggml_time_us(); @@ -220,17 +222,14 @@ int main(int argc, char ** argv) { { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); - llama_sampler_accept(smpl, new_token_id); - // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { - LOG_TEE("\n"); + LOG("\n"); break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); n_decode += 1; @@ -245,22 +244,22 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); + LOG("\n"); const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG("\n"); + llama_perf_context_print(ctx); - fprintf(stderr, "\n"); + LOG("\n"); llama_sampler_free(smpl); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 570ee8aeb..18e75a7a2 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,18 +1,21 @@ +#include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" +#include +#include +#include #include #include #include #include +#include +#include +#include #include #include -#include -#include #include -#include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -40,7 +43,7 @@ static void write_logfile( } if (params.hellaswag) { - fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__); + LOG_WRN("%s: logging results is not implemented for HellaSwag. 
No files will be written.\n", __func__); return; } @@ -48,7 +51,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -57,7 +60,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -343,16 +346,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + LOG_INF("%s: tokenizing the input ..\n", __func__); std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -363,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & prob_history.resize(tokens.size()); if (params.ppl_stride <= 0) { - fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); + LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } const int calc_chunk = n_ctx; - fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); + LOG_INF("%s: have %zu tokens. 
Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); if (int(tokens.size()) <= calc_chunk) { - fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, + LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, tokens.size(), n_ctx, params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } @@ -386,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & int count = 0; double nll = 0.0; - fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); for (int i = 0; i < n_chunk; ++i) { const int start = i * params.ppl_stride; const int end = start + calc_chunk; const int num_batches = (calc_chunk + n_batch - 1) / n_batch; - //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); + //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); std::vector logits; @@ -406,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); - //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); + //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - //fprintf(stderr, "%s : failed to eval\n", __func__); + //LOG_ERR("%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -433,16 +436,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + LOG("%.2f minutes\n", total_seconds / 60.0); } + LOG("\n"); - //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); + //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { // Calculate probability of next token, given the previous ones. 
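For reference, the quantity accumulated across these chunks is the standard perplexity of the evaluated tokens; with N the number of scored positions, the code below computes

PPL = \exp\left( -\frac{1}{N} \sum_{i=1}^{N} \log p(x_i \mid x_{<i}) \right)

i.e. the e^(average negative log-likelihood) referred to in the comment that follows.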
@@ -459,13 +463,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & } // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); } else { - printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); + LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); } - fflush(stdout); } - printf("\n"); + LOG("\n"); return {tokens, std::exp(nll / count), logit_history, prob_history}; } @@ -487,26 +490,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par if (!params.logits_file.empty()) { logits_stream.open(params.logits_file.c_str(), std::ios::binary); if (!logits_stream.is_open()) { - fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); return {}; } - fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); + LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); logits_stream.write("_logits_", 8); logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx)); } auto tim1 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + LOG_INF("%s: tokenizing the input ..\n", __func__); std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); + LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -539,7 +542,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par logits.reserve((size_t)n_ctx * n_vocab); } - fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); + LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); @@ -612,7 +615,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -627,14 +630,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par llama_synchronize(ctx); const auto t_end = std::chrono::high_resolution_clock::now(); const float t_total = std::chrono::duration<float>(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total*n_chunk/n_seq); if (total_seconds >= 60*60) { -
fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + LOG("%.2f minutes\n", total_seconds / 60.0); } + LOG("\n"); for (int seq = 0; seq < n_seq_batch; seq++) { const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first); @@ -655,19 +659,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); } else { double av = nll/count; double av2 = nll2/count - av*av; if (av2 > 0) av2 = sqrt(av2/(count-1)); - printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); + LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } } - fflush(stdout); logits.clear(); } - printf("\n"); + LOG("\n"); nll2 /= count; nll /= count; @@ -675,9 +678,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par nll2 -= nll * nll; if (nll2 > 0) { nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); } else { - printf("Unexpected negative standard deviation of log(prob)\n"); + LOG_ERR("Unexpected negative standard deviation of log(prob)\n"); } llama_batch_free(batch); @@ -703,7 +706,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< const int ret = llama_decode(ctx, batch_view); if (ret != 0) { - LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } @@ -789,15 +792,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } if (prompt_lines.size() % 6 != 0) { - fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__); + LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__); return; } size_t hs_task_count = prompt_lines.size()/6; - fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); + LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; - fprintf(stderr, "================================= is_spm = %d\n", is_spm); + LOG_INF("================================= is_spm = %d\n", is_spm); // The tasks should be randomized so the score stabilizes quickly. 
bool randomize_tasks = true; @@ -824,7 +827,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { std::vector seq_tokens[4]; }; - fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); + LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); // Select and read data from prompt lines std::vector hs_data(hs_task_count); @@ -870,9 +873,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } } - fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__); + LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__); - printf("\ntask\tacc_norm\n"); + LOG("\ntask\tacc_norm\n"); double acc = 0.0f; @@ -940,7 +943,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -948,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return; } @@ -998,7 +1001,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } } - //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); + //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); // If the gold ending got the maximum logprobe add one accuracy point if (ending_logprob_max_idx == hs_cur.gold_ending_idx) { @@ -1006,8 +1009,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } // Print the accumulated accuracy mean x 100 - printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); - fflush(stdout); + LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); } i0 = i1 - 1; @@ -1015,7 +1017,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { llama_batch_free(batch); - printf("\n"); + LOG("\n"); } struct winogrande_entry { @@ -1059,7 +1061,7 @@ static std::vector load_winogrande_from_csv(const std::string } } if (ipos != 4) { - printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); + LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); continue; } auto sentence = line[comma_pos[0]+1] == '"' ? 
line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3) @@ -1073,13 +1075,13 @@ static std::vector load_winogrande_from_csv(const std::string if (sentence[where] == '_') break; } if (where == int(sentence.size())) { - printf("%s: no _ in <%s>\n", __func__, sentence.c_str()); + LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str()); continue; } std::istringstream stream(answer.c_str()); int i_answer; stream >> i_answer; if (stream.fail() || i_answer < 1 || i_answer > 2) { - printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); + LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); continue; } result.emplace_back(); @@ -1108,14 +1110,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { auto data = load_winogrande_from_csv(params.prompt); if (data.empty()) { - fprintf(stderr, "%s: no tasks\n", __func__); + LOG_ERR("%s: no tasks\n", __func__); return; } - fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size()); + LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size()); if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) { - fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); + LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); std::mt19937 rng(1); std::vector aux(data.size()); for (int i = 0; i < int(data.size()); ++i) { @@ -1133,7 +1135,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { data = std::move(selected); } - fprintf(stderr, "%s : tokenizing selected tasks\n", __func__); + LOG_INF("%s : tokenizing selected tasks\n", __func__); for (auto & task : data) { task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); @@ -1156,7 +1158,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); } - fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__); + LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__); const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); @@ -1217,7 +1219,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { } if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -1225,7 +1227,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return; } @@ -1285,20 +1287,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { ++n_done; // print the accumulated accuracy mean x 100 - printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); - fflush(stdout); + LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); } i0 = i1 - 1; } - printf("\n"); + LOG("\n"); if (n_done < 100) return; const float p = 1.f*n_correct/n_done; const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1)); - printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); + + 
LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); } static bool deserialize_string(std::istream & in, std::string & str) { @@ -1347,7 +1349,7 @@ struct multiple_choice_task { static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) { if (task.question.empty() || task.mc1.answers.empty()) { if (log_error) { - printf("%s: found bad task with empty question and/or answers\n", __func__); + LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__); } return false; } @@ -1355,7 +1357,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic for (auto& answer : task.mc1.answers) { if (answer.empty()) { if (log_error) { - printf("%s: found empty answer\n", __func__); + LOG_ERR("%s: found empty answer\n", __func__); } return false; } @@ -1409,14 +1411,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); if (strstream.fail() || n_task == 0) { - printf("%s: no tasks\n", __func__); + LOG_ERR("%s: no tasks\n", __func__); return; } - printf("%s: there are %u tasks in prompt\n", __func__, n_task); + LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task); std::vector task_pos(n_task); strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t)); if (strstream.fail()) { - printf("%s: failed to read task positions from prompt\n", __func__); + LOG_ERR("%s: failed to read task positions from prompt\n", __func__); return; } @@ -1424,21 +1426,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) { // Use all tasks tasks.resize(n_task); - printf("%s: reading tasks", __func__); + LOG_INF("%s: reading tasks", __func__); int n_dot = std::max((int) n_task/100, 1); int i = 0; for (auto& task : tasks) { ++i; if (!task.deserialize(strstream)) { - printf("%s: failed to read task %d of %u\n", __func__, i, n_task); + LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task); return; } - if (i%n_dot == 0) printf("."); + if (i%n_dot == 0) LOG("."); } - printf("done\n"); + LOG("done\n"); } else { - printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); + LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); std::mt19937 rng(1); std::vector aux(n_task); for (uint32_t i = 0; i < n_task; ++i) aux[i] = i; @@ -1451,18 +1453,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params aux.pop_back(); strstream.seekg(task_pos[idx], std::ios::beg); if (!task.deserialize(strstream)) { - printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); + LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); return; } } n_task = params.multiple_choice_tasks; } - printf("%s: preparing task data", __func__); - fflush(stdout); + LOG_INF("%s: preparing task data", __func__); if (n_task > 500) { - printf("..."); - fflush(stdout); + LOG("..."); std::atomic counter(0); std::atomic n_bad(0); auto prepare = [&counter, &n_bad, &tasks, ctx] () { @@ -1486,11 +1486,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params for (auto& w : workers) w = std::thread(prepare); prepare(); for (auto& w : workers) w.join(); - printf("done\n"); - 
fflush(stdout); + LOG("done\n"); int nbad = n_bad; if (nbad > 0) { - printf("%s: found %d malformed tasks\n", __func__, nbad); + LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad); return; } } else { @@ -1502,16 +1501,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params return; } if (i_task%n_dot == 0) { - printf("."); - fflush(stdout); + LOG("."); } } - printf("done\n"); + LOG("done\n"); } - printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); + LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); - printf("\ntask\tacc_norm\n"); + LOG("\ntask\tacc_norm\n"); const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); @@ -1590,7 +1588,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params } if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -1598,7 +1596,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return; } @@ -1622,13 +1620,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // compute the logprobs for each ending of the decoded tasks for (size_t i = i0; i < i1; ++i) { auto & cur_task = tasks[i]; - //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); + //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) { // if (cur_task.mc1.labels[j] == 1) { - // printf("%d", j+1); + // LOG("%d", j+1); // } //} - //printf("\n common_prefix: %zu\n", cur_task.common_prefix); + //LOG("\n common_prefix: %zu\n", cur_task.common_prefix); // get the logits of the last token of the common prefix std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); @@ -1640,13 +1638,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params size_t count = 1; float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { - //printf(" %zu %g\n", ir, eval_results[ir]); + //LOG(" %zu %g\n", ir, eval_results[ir]); ++count; log_prob += eval_results[ir++]; } cur_task.log_probs[s] = log_prob / count; - //printf(" Final: %g\n", log_prob / count); - //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); + //LOG(" Final: %g\n", log_prob / count); + //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); } // Find the ending with maximum logprob @@ -1666,8 +1664,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params ++n_done; // Print the accumulated accuracy mean x 100 - printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); - fflush(stdout); + LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); } i0 = i1 - 1; @@ -1679,29 +1676,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params float p = 1.f*n_correct/n_done; float sigma = sqrt(p*(1-p)/(n_done-1)); - printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + LOG("\n"); + LOG_INF("Final result: 
%.4f +/- %.4f\n", 100.f*p, 100.f*sigma); p = 1.f*n_done/n_tot_answers; sigma = sqrt(p*(1-p)/(n_done-1)); - printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); - printf("\n"); + LOG_INF("\n"); } static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (params.logits_file.empty()) { - fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); + LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; } std::ifstream in(params.logits_file.c_str(), std::ios::binary); if (!in) { - fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str()); return; } { char check[9]; check[8] = 0; in.read(check, 8); if (in.fail() || strncmp("_logits_", check, 8) != 0) { - fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); return; } } @@ -1709,7 +1707,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { uint32_t n_ctx; in.read((char *)&n_ctx, sizeof(n_ctx)); if (n_ctx > llama_n_ctx(ctx)) { - fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", + LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); } @@ -1717,16 +1715,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { in.read((char *)&n_vocab, sizeof(n_vocab)); in.read((char *)&n_chunk, sizeof(n_chunk)); if (in.fail()) { - fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); return; } if (n_vocab != llama_n_vocab(llama_get_model(ctx))) { - fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); + LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); } std::vector tokens(n_ctx * n_chunk); if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { - fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); return; } @@ -1775,7 +1773,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { - fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i); + LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i); return; } @@ -1796,7 +1794,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return; } @@ 
-1813,16 +1811,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); - - printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); + LOG("%.2f minutes\n", total_seconds / 60.0); } + LOG("\n"); + LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); const int first = n_ctx/2; const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); @@ -1831,79 +1829,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { p_diff_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first; - printf("%4d", i+1); + LOG("%4d", i+1); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); + LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); + LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); + LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); double p_top_val = 1.*kld.n_same_top/kld.count; double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1)); - printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); + LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); - printf("\n"); - - fflush(stdout); + LOG("\n"); logits.clear(); } - printf("\n"); + LOG("\n"); if (kld.count < 100) return; // we do not wish to do statistics on so few values std::sort(kld_values.begin(), kld_values.end()); std::sort(p_diff_values.begin(), p_diff_values.end()); - printf("====== Perplexity statistics ======\n"); + LOG("====== Perplexity statistics ======\n"); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); + LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); auto log_ppl_base = 
mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); const double ppl_base_val = exp(log_ppl_base.first); const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 ) - printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); + LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); - // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); + // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second); - printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); + LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); + LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); const double ppl_ratio_val = exp(log_ppl_ratio_val); const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 ) - printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); + LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov; const double ppl_diff_val = ppl_val - ppl_base_val; const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov); - printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); + LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); - printf("\n"); + LOG("\n"); - printf("====== KL divergence statistics ======\n"); + LOG("====== KL divergence statistics ======\n"); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); + LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); auto kld_median = kld_values.size()%2 == 0 ? 
0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) : kld_values[kld_values.size()/2]; @@ -1915,50 +1911,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)]; }; - printf("Maximum KLD: %10.6f\n", kld_values.back()); - printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); - printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); - printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); - printf("Median KLD: %10.6f\n", kld_median); - printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); - printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); - printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f)); - printf("Minimum KLD: %10.6f\n", kld_values.front()); + LOG("Maximum KLD: %10.6f\n", kld_values.back()); + LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); + LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); + LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); + LOG("Median KLD: %10.6f\n", kld_median); + LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); + LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); + LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f)); + LOG("Minimum KLD: %10.6f\n", kld_values.front()); - printf("\n"); + LOG("\n"); - printf("====== Token probability statistics ======\n"); + LOG("====== Token probability statistics ======\n"); auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count); - printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second); + LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second); auto p_diff_median = p_diff_values.size()%2 == 0 ? 
0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1]) : p_diff_values[p_diff_values.size()/2]; - printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); - printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); - printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); - printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); - printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); - printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); - printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); - printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); - printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); - printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); - printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); - printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); - printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); + LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); + LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); + LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); + LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); + LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); + LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); + LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); + LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); + LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); + LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); + LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); + LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); + LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); - // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); + // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); const double same_top_p = 1.0*kld.n_same_top/kld.count; - printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); - + LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); } int main(int argc, char ** argv) { @@ -1967,15 +1962,16 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { return 1; } + gpt_init(); + const int32_t n_ctx = params.n_ctx; if (n_ctx <= 0) { - fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__); + LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__); return 1; } @@ -2000,15 +1996,11 @@ int main(int argc, char ** argv) { } if (params.ppl_stride > 0) { - fprintf(stderr, "Will perform strided perplexity 
calculation -> adjusting context size from %d to %d\n", + LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", params.n_ctx, params.n_ctx + params.ppl_stride/2); params.n_ctx += params.ppl_stride/2; } - print_build_info(); - - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - llama_backend_init(); llama_numa_init(params.numa); @@ -2018,21 +2010,21 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } struct results_perplexity results; @@ -2048,8 +2040,9 @@ int main(int argc, char ** argv) { results = perplexity(ctx, params, n_ctx); } - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG("\n"); + llama_perf_context_print(ctx); + write_logfile(ctx, params, model, results); llama_free(ctx); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 3ee4eb971..62680cda4 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67..704f0d56b 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis Several quantization methods are supported. They differ in the resulting model disk size and inference speed. +The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format. 
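+
+For reference, producing one of these variants is the same `llama-quantize` invocation as any other type, only the type name changes (a minimal sketch — the model paths are placeholders, and the interleaved types are only available if the build includes them):
+
+```bash
+# Same disk size as a plain Q4_0 quantization of the same model,
+# only the on-disk ordering of the quantized blocks differs.
+./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_0_4_4.gguf Q4_0_4_4
+```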
+ *(outdated)* | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index dd8a82e6e..5971690f1 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -1,13 +1,16 @@ +#include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include #include +#include // TODO: remove me static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); + LOG("\n"); } struct chunk { @@ -16,7 +19,7 @@ struct chunk { // original file position size_t filepos; // original text data - std::string textdata = ""; + std::string textdata; // tokenized text data std::vector tokens; // embedding @@ -30,14 +33,14 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz std::ifstream f(filename.c_str()); if (!f.is_open()) { - fprintf(stderr, "Error: could not open file %s\n", filename.c_str()); + LOG_ERR("could not open file %s\n", filename.c_str()); return chunks; } chunk current_chunk; char buffer[1024]; int64_t filepos = 0; - std::string current = ""; + std::string current; while (f.read(buffer, 1024)) { current += std::string(buffer, f.gcount()); size_t pos; @@ -83,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu llama_kv_cache_clear(ctx); // run model - fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); if (llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to decode\n", __func__); + LOG_ERR("%s : failed to decode\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { @@ -98,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu if (embd == NULL) { embd = llama_get_embeddings_ith(ctx, i); if (embd == NULL) { - fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); + LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i); continue; } } @@ -111,29 +114,28 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { return 1; } + gpt_init(); + // For BERT models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; params.embedding = true; if (params.chunk_size <= 0) { - fprintf(stderr, "chunk_size must be positive\n"); + LOG_ERR("chunk_size must be positive\n"); return 1; } if (params.context_files.empty()) { - fprintf(stderr, "context_files must be specified\n"); + LOG_ERR("context_files must be specified\n"); return 1; } - print_build_info(); - - printf("processing files:\n"); + LOG_INF("processing files:\n"); for (auto & context_file : params.context_files) { - printf("%s\n", context_file.c_str()); + LOG_INF("%s\n", context_file.c_str()); } std::vector chunks; @@ -141,7 +143,7 @@ int main(int argc, char ** argv) { 
std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); } - printf("Number of chunks: %ld\n", chunks.size()); + LOG_INF("Number of chunks: %ld\n", chunks.size()); llama_backend_init(); llama_numa_init(params.numa); @@ -153,7 +155,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_init.context; if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } @@ -162,19 +164,19 @@ int main(int argc, char ** argv) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + LOG_ERR("%s: pooling type NONE not supported\n", __func__); return 1; } if (n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } // max batch size @@ -185,7 +187,7 @@ int main(int argc, char ** argv) { for (auto & chunk : chunks) { auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); if (inp.size() > n_batch) { - fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); return 1; } @@ -199,12 +201,12 @@ int main(int argc, char ** argv) { // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) chunks.size(); i++) { - fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); + LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { - fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); + LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); } - fprintf(stderr, "\n\n"); + LOG_INF("\n\n"); } } @@ -256,7 +258,7 @@ int main(int argc, char ** argv) { // start loop, receive query and return top k similar chunks based on cosine similarity std::string query; while (true) { - printf("Enter query: "); + LOG("Enter query: "); std::getline(std::cin, query); std::vector query_tokens = llama_tokenize(ctx, query, true); @@ -280,19 +282,19 @@ int main(int argc, char ** argv) { return a.second > b.second; }); - printf("Top %d similar chunks:\n", params.sparams.top_k); + LOG("Top %d similar chunks:\n", params.sparams.top_k); for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { - printf("filename: %s\n", chunks[similarities[i].first].filename.c_str()); - printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); - printf("similarity: %f\n", similarities[i].second); - printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); - 
printf("--------------------\n"); + LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str()); + LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); + LOG("similarity: %f\n", similarities[i].second); + LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); + LOG("--------------------\n"); } } } - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG("\n"); + llama_perf_context_print(ctx); // clean up llama_batch_free(query_batch); diff --git a/examples/rpc/README.md b/examples/rpc/README.md index adedc8909..312bb634d 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -10,20 +10,21 @@ This can be used for distributed LLM inference with `llama.cpp` in the following ```mermaid flowchart TD - rpcb---|TCP|srva - rpcb---|TCP|srvb - rpcb-.-|TCP|srvn + rpcb<-->|TCP|srva + rpcb<-->|TCP|srvb + rpcb<-.->|TCP|srvn subgraph hostn[Host N] - srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"] + srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"] end subgraph hostb[Host B] - srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"] + srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"] end subgraph hosta[Host A] - srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"] + srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"] end subgraph host[Main Host] - ggml[llama.cpp]---rpcb[RPC backend] + local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli] + ggml[llama-cli]<-->rpcb[RPC backend] end style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5 ``` @@ -62,17 +63,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. -On the main host build `llama.cpp` only with `-DGGML_RPC=ON`: - -```bash -mkdir build-rpc -cd build-rpc -cmake .. -DGGML_RPC=ON -cmake --build . --config Release -``` - -Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: +On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options. +Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`: ```bash $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 ``` + +This way you can offload model layers to both local and remote devices. 
+ diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index b54ec3bd8..0117d9357 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -10,8 +11,7 @@ int main(int argc, char ** argv) { params.prompt = "The quick brown fox"; params.sparams.seed = 1234; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -74,8 +74,6 @@ int main(int argc, char ** argv) { auto next_token = llama_sampler_sample(smpl, ctx, -1); auto next_token_str = llama_token_to_piece(ctx, next_token); - llama_sampler_accept(smpl, next_token); - printf("%s", next_token_str.c_str()); result0 += next_token_str; @@ -132,8 +130,6 @@ int main(int argc, char ** argv) { auto next_token = llama_sampler_sample(smpl2, ctx2, -1); auto next_token_str = llama_token_to_piece(ctx2, next_token); - llama_sampler_accept(smpl2, next_token); - printf("%s", next_token_str.c_str()); result1 += next_token_str; @@ -222,8 +218,6 @@ int main(int argc, char ** argv) { auto next_token = llama_sampler_sample(smpl3, ctx3, -1); auto next_token_str = llama_token_to_piece(ctx3, next_token); - llama_sampler_accept(smpl3, next_token); - printf("%s", next_token_str.c_str()); result2 += next_token_str; diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index dbe41f1fd..3e717e882 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-server) -option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) + +option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) @@ -30,6 +30,7 @@ set(PUBLIC_ASSETS system-prompts.js prompt-formats.js json-schema-to-grammar.mjs + loading.html ) foreach(asset ${PUBLIC_ASSETS}) @@ -45,9 +46,6 @@ endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) -target_compile_definitions(${TARGET} PRIVATE - SERVER_VERBOSE=$ -) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/server/README.md b/examples/server/README.md index ed1201ba8..168e14a99 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co | `--version` | show version and build info | | `-v, --verbose` | print verbose information | | `--verbosity N` | set specific verbosity level (default: 0) | -| `--verbose-prompt` | print a verbose prompt before generation (default: false) | -| `--no-display-prompt` | don't print prompt at generation (default: false) | -| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) | | `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | | `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | | `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)
| +| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
| | `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)
| | `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | | `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | +| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
| | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | -| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | -| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | | `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | -| `--chunks N` | max number of chunks to process (default: -1, -1 = all) | | `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | | `-p, --prompt PROMPT` | prompt to start generation with | | `-f, --file FNAME` | a file containing the prompt (default: none) | -| `--in-file FNAME` | an input file (repeat to specify multiple files) | | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--no-escape` | do not process escape sequences | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | -| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) | +| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) | | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | | `--penalize-nl` | penalize newline tokens (default: false) | @@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1) | -| `-ns, --sequences N` | number of sequences to decode (default: 1) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing | | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437 | -| `-ngl, --gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | +| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) | @@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co | `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | | `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | -| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_MODEL) | +| `-a, --alias STRING` | set alias for model name (to be used by REST API) | | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)
(env: LLAMA_ARG_HF_REPO) | @@ -123,10 +118,9 @@ The project is under active development, and we are [looking for feedback and co | `--api-key-file FNAME` | path to file containing API keys (default: none) | | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key | | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate | -| `--timeout N` | server read/write timeout in seconds (default: 600) | +| `-to, --timeout N` | server read/write timeout in seconds (default: 600) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | -| `--log-format {text, json}` | log output format: json or text (default: json) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--no-slots` | disables slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | @@ -412,9 +406,44 @@ Notice that each `probs` is an array of length `n_probs`. *Options:* - `content`: Set the text to tokenize. + `content`: (Required) The text to tokenize. - `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` + `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` + + `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false` + +**Response:** + +Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise. + + +If `with_pieces` is `false`: +```json +{ + "tokens": [123, 456, 789] +} +``` + +If `with_pieces` is `true`: +```json +{ + "tokens": [ + {"id": 123, "piece": "Hello"}, + {"id": 456, "piece": " world"}, + {"id": 789, "piece": "!"} + ] +} +``` + +With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k +```json +{ + "tokens": [ + {"id": 198, "piece": [195]}, // hex C3 + {"id": 164, "piece": [161]} // hex A1 + ] +} +``` ### POST `/detokenize`: Convert tokens to text diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 0f18ca396..353368e13 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -40,7 +40,6 @@ server --host localhost --port 8080 \ --parallel 8 \ --batch-size 512 \ --ctx-size 4096 \ - --log-format text \ -ngl 33 ``` diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 2daac0884..a9ed747f5 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -272,7 +272,6 @@ def start_server_background(args): server_args.append('--cont-batching') server_args.append('--metrics') server_args.append('--flash-attn') - server_args.extend(['--log-format', "text"]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") pkwargs = { diff --git a/examples/server/public/loading.html b/examples/server/public/loading.html new file mode 100644 index 000000000..c3fd19a0f --- /dev/null +++ b/examples/server/public/loading.html @@ -0,0 +1,12 @@ + + + + + + +
+ The model is loading. Please wait.<br/>
+ The user interface will appear soon. +
+ + diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9ab8f8ca6..b5f264ff1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,6 +1,9 @@ #include "utils.hpp" +#include "arg.h" #include "common.h" +#include "log.h" +#include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" @@ -26,24 +29,37 @@ #include "system-prompts.js.hpp" #include "prompt-formats.js.hpp" #include "json-schema-to-grammar.mjs.hpp" +#include "loading.html.hpp" #include -#include #include #include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include + +#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) + +#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_DBG(fmt, ...) 
LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) using json = nlohmann::ordered_json; -bool server_verbose = false; -bool server_log_json = true; - enum stop_type { STOP_TYPE_FULL, STOP_TYPE_PARTIAL, @@ -194,6 +210,8 @@ struct server_slot { std::function callback_on_release; void reset() { + SLT_DBG(*this, "%s", "\n"); + n_prompt_tokens = 0; generated_text = ""; truncated = false; @@ -231,8 +249,9 @@ struct server_slot { return state != SLOT_STATE_IDLE; } - void add_token_string(const completion_token_output & token) { + void add_token(const completion_token_output & token) { if (!is_processing()) { + SLT_WRN(*this, "%s", "slot is not processing\n"); return; } generated_token_probs.push_back(token); @@ -240,14 +259,10 @@ struct server_slot { void release() { if (is_processing()) { + SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated); + t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; state = SLOT_STATE_IDLE; - LOG_INFO("slot released", { - {"id_slot", id}, - {"id_task", id_task}, - {"n_past", n_past}, - {"truncated", truncated}, - }); callback_on_release(id); } } @@ -295,49 +310,20 @@ struct server_slot { } void print_timings() const { - char buffer[512]; + const double t_prompt = t_prompt_processing / n_prompt_tokens_processed; + const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - double t_token = t_prompt_processing / n_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + const double t_gen = t_token_generation / n_decoded; + const double n_gen_second = 1e3 / t_token_generation * n_decoded; - snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - t_prompt_processing, n_prompt_tokens_processed, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"n_prompt_tokens_processed", n_prompt_tokens_processed}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - t_token = t_token_generation / n_decoded; - n_tokens_second = 1e3 / t_token_generation * n_decoded; - - snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", - t_token_generation, n_decoded, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_token_generation", t_token_generation}, - {"n_decoded", n_decoded}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"t_token_generation", t_token_generation}, - {"t_total", t_prompt_processing + t_token_generation}, - }); + SLT_INF(*this, + "\n" + "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "\r total time = %10.2f ms / %5d tokens\n", + t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, + t_token_generation, n_decoded, t_gen, n_gen_second, + t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); } }; @@ -413,8 +399,8 @@ struct server_queue { std::unique_lock lock(mutex_tasks); if (task.id == -1) { task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", 
task.id}}); } + QUE_DBG("new task, id = %d, front = %d\n", task.id, front); if (front) { queue_tasks.push_front(std::move(task)); } else { @@ -430,8 +416,8 @@ struct server_queue { for (auto & task : tasks) { if (task.id == -1) { task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", task.id}}); } + QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); if (front) { queue_tasks.push_front(std::move(task)); } else { @@ -445,6 +431,7 @@ struct server_queue { // Add a new task, but defer until one slot is available void defer(server_task task) { std::unique_lock lock(mutex_tasks); + QUE_DBG("defer task, id = %d\n", task.id); queue_tasks_deferred.push_back(std::move(task)); condition_tasks.notify_one(); } @@ -453,7 +440,6 @@ struct server_queue { int get_new_id() { std::unique_lock lock(mutex_tasks); int new_id = id++; - LOG_VERBOSE("new task id", {{"new_id", new_id}}); return new_id; } @@ -495,7 +481,7 @@ struct server_queue { running = true; while (true) { - LOG_VERBOSE("new task may arrive", {}); + QUE_DBG("%s", "processing new tasks\n"); while (true) { std::unique_lock lock(mutex_tasks); @@ -506,21 +492,22 @@ struct server_queue { server_task task = queue_tasks.front(); queue_tasks.pop_front(); lock.unlock(); - LOG_VERBOSE("callback_new_task", {{"id_task", task.id}}); + + QUE_DBG("processing task, id = %d\n", task.id); callback_new_task(task); } // all tasks in the current loop is processed, slots data is now ready - LOG_VERBOSE("callback_update_slots", {}); + QUE_DBG("%s", "update slots\n"); callback_update_slots(); - LOG_VERBOSE("wait for new task", {}); + QUE_DBG("%s", "waiting for new tasks\n"); { std::unique_lock lock(mutex_tasks); if (queue_tasks.empty()) { if (!running) { - LOG_VERBOSE("ending start_loop", {}); + QUE_DBG("%s", "terminate\n"); return; } condition_tasks.wait(lock, [&]{ @@ -544,7 +531,7 @@ struct server_response { // add the id_task to the list of tasks waiting for response void add_waiting_task_id(int id_task) { - LOG_VERBOSE("waiting for task id", {{"id_task", id_task}}); + SRV_DBG("waiting for task id = %d\n", id_task); std::unique_lock lock(mutex_results); waiting_task_ids.insert(id_task); @@ -558,7 +545,7 @@ struct server_response { // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int id_task) { - LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}}); + SRV_DBG("task id = %d is done\n", id_task); std::unique_lock lock(mutex_results); waiting_task_ids.erase(id_task); @@ -592,12 +579,13 @@ struct server_response { // Send a new result to a waiting id_task void send(server_task_result & result) { - LOG_VERBOSE("send new result", {{"id_task", result.id}}); + SRV_DBG("sending result for task id = %d\n", result.id); std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { if (result.id == id_task) { - LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}}); + SRV_DBG("task id = %d moved to result queue\n", result.id); + queue_results.push_back(std::move(result)); condition_results.notify_all(); return; @@ -609,11 +597,11 @@ struct server_response { struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - std::vector lora_adapters; + std::vector loras; gpt_params params; - llama_batch batch; + llama_batch batch = {}; bool clean_kv_cache = true; bool add_bos_token = true; @@ -669,11 +657,13 @@ struct server_context { llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; - ctx = 
llama_init.context; - lora_adapters = llama_init.lora_adapters; + ctx = llama_init.context; + loras = llama_init.lora_adapters; + params.n_parallel -= 1; // but be sneaky about it + if (model == nullptr) { - LOG_ERROR("unable to load model", {{"model", params.model}}); + SRV_ERR("failed to load model, '%s'\n", params.model.c_str()); return false; } @@ -696,7 +686,7 @@ struct server_context { void init() { const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); + SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel); for (int i = 0; i < params.n_parallel; i++) { server_slot slot; @@ -705,10 +695,7 @@ struct server_context { slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_INFO("new slot", { - {"id_slot", slot.id}, - {"n_ctx_slot", slot.n_ctx} - }); + SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); const int ga_n = params.grp_attn_n; const int ga_w = params.grp_attn_w; @@ -719,11 +706,7 @@ struct server_context { //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_INFO("slot self-extend", { - {"id_slot", slot.id}, - {"ga_n", ga_n}, - {"ga_w", ga_w} - }); + SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w); } slot.ga_i = 0; @@ -846,11 +829,7 @@ struct server_context { } if (ret != nullptr) { - LOG_VERBOSE("selected slot by lcp similarity", { - {"id_slot", ret->id}, - {"max_lcp_len", max_lcp_len}, - {"similarity", similarity}, - }); + SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity); } } @@ -871,10 +850,7 @@ struct server_context { } if (ret != nullptr) { - LOG_VERBOSE("selected slot by lru", { - {"id_slot", ret->id}, - {"t_last", t_last}, - }); + SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last); } } @@ -938,17 +914,14 @@ struct server_context { } if (slot.params.cache_prompt && slot.ga_n != 1) { - LOG_WARNING("cache_prompt is not supported with group-attention", {}); slot.params.cache_prompt = false; + SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n"); } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? 
- LOG_WARNING("Max tokens to predict exceeds server configuration", { - {"params.n_predict", slot.params.n_predict}, - {"slot.n_predict", slot.n_predict}, - }); slot.params.n_predict = slot.n_predict; + SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict); } // infill @@ -1057,16 +1030,13 @@ struct server_context { slot.state = SLOT_STATE_PROCESSING_PROMPT; slot.prompt_tokens.clear(); - LOG_INFO("slot is processing task", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - }); + SLT_INF(slot, "%s", "processing task\n"); return true; } void kv_cache_clear() { - LOG_VERBOSE("clearing KV cache", {}); + SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache llama_kv_cache_clear(ctx); @@ -1074,9 +1044,7 @@ struct server_context { } void system_prompt_update() { - LOG_VERBOSE("system prompt update", { - {"system_prompt", system_prompt}, - }); + SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str()); kv_cache_clear(); system_tokens.clear(); @@ -1097,7 +1065,7 @@ struct server_context { } if (llama_decode(ctx, batch) != 0) { - LOG_ERROR("llama_decode() failed", {}); + SRV_ERR("%s", "llama_decode() failed\n"); return; } } @@ -1112,11 +1080,9 @@ struct server_context { } bool system_prompt_set(const std::string & sys_prompt) { - system_prompt = sys_prompt; + SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str()); - LOG_VERBOSE("system prompt process", { - {"system_prompt", system_prompt}, - }); + system_prompt = sys_prompt; // release all slots for (server_slot & slot : slots) { @@ -1184,7 +1150,7 @@ struct server_context { // add the token to slot queue and cache } - slot.add_token_string(result); + slot.add_token(result); if (slot.params.stream) { send_partial_response(slot, result); } @@ -1199,55 +1165,30 @@ struct server_context { slot.stopped_limit = true; slot.has_next_token = false; - LOG_VERBOSE("stopped by limit", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_decoded", slot.n_decoded}, - {"n_predict", slot.params.n_predict}, - }); + SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); } if (llama_token_is_eog(model, result.tok)) { slot.stopped_eos = true; slot.has_next_token = false; - LOG_VERBOSE("eos token found", {}); + SLT_DBG(slot, "%s", "stopped by EOS\n"); } - auto n_ctx_train = llama_n_ctx_train(model); - if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 - && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { - LOG_WARNING("n_predict is not set and self-context extend is disabled." - " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", { - { "id_slot", slot.id }, - { "params.n_predict", slot.params.n_predict }, - { "slot.n_prompt_tokens", slot.n_prompt_tokens }, - { "slot.n_decoded", slot.n_decoded }, - { "slot.n_predict", slot.n_predict }, - { "n_slots", params.n_parallel }, - { "slot.n_ctx", slot.n_ctx }, - { "n_ctx", n_ctx }, - { "n_ctx_train", n_ctx_train }, - { "ga_n", slot.ga_n }, - }); + const auto n_ctx_train = llama_n_ctx_train(model); + + if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; // stop prediction + + SLT_WRN(slot, + "n_predict (%d) is not set and self-context extend is disabled. 
" + "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n", + slot.params.n_predict, n_ctx_train); } - LOG_VERBOSE("next token", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str()); return slot.has_next_token; // continue } @@ -1264,6 +1205,7 @@ struct server_context { {"n_predict", slot.n_predict}, // Server configured n_predict {"model", params.model_alias}, {"seed", slot.sparams.seed}, + {"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0}, {"temperature", slot.sparams.temp}, {"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, @@ -1303,10 +1245,7 @@ struct server_context { } void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - LOG_ERROR("task error", { - {"id_task", id_task}, - {"error", error}, - }); + SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); server_task_result res; res.id = id_task; @@ -1425,10 +1364,7 @@ struct server_context { } if (embd == NULL) { - LOG_ERROR("failed to get embeddings", { - {"token", batch.token [i]}, - {"seq_id", batch.seq_id[i][0]} - }); + SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); res.data = json { {"embedding", std::vector(n_embd, 0.0f)}, @@ -1445,6 +1381,8 @@ struct server_context { }; } + SLT_DBG(slot, "%s", "sending embeddings\n"); + queue_results.send(res); } @@ -1461,7 +1399,7 @@ struct server_context { task.type = SERVER_TASK_TYPE_COMPLETION; if (replace_prompt) { task.data = task_data; - task.data["prompt"] = prompt; + task.data["prompt"] = std::move(prompt); } else { task.data = std::move(task_data); } @@ -1505,7 +1443,8 @@ struct server_context { std::vector cancel_tasks; cancel_tasks.reserve(id_tasks.size()); for (const auto & id_task : id_tasks) { - LOG_VERBOSE("cancel task", {{"id_task", id_task}}); + SRV_WRN("cancel task, id_task = %d\n", id_task); + server_task task; task.type = SERVER_TASK_TYPE_CANCEL; task.id_target = id_task; @@ -1517,7 +1456,10 @@ struct server_context { } // receive the results from task(s) created by create_tasks_cmpl - void receive_cmpl_results(const std::unordered_set & id_tasks, std::function&)> result_handler, std::function error_handler) { + void receive_cmpl_results( + const std::unordered_set & id_tasks, + const std::function&)> & result_handler, + const std::function & error_handler) { // TODO: currently, there is no way to detect the client has cancelled the request std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { @@ -1536,7 +1478,10 @@ struct server_context { } // receive the results from task(s) created by create_tasks_cmpl, in stream mode - void receive_cmpl_results_stream(const std::unordered_set & id_tasks, std::function result_handler, std::function error_handler) { + void receive_cmpl_results_stream( + const std::unordered_set & id_tasks, const + std::function & result_handler, const + std::function & error_handler) { size_t n_finished = 0; while 
(true) { server_task_result result = queue_results.recv(id_tasks); @@ -1584,13 +1529,13 @@ struct server_context { if (slot == nullptr) { // if no slot is available, we defer this task for processing later - LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); + SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1612,7 +1557,7 @@ struct server_context { slot->index = json_value(task.data, "index", 0); if (!launch_slot_with_task(*slot, task)) { - LOG_ERROR("error while launching slot", task.data); + SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); break; } } break; @@ -1661,18 +1606,7 @@ struct server_context { slots_data.push_back(slot_data); } - LOG_INFO("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots} - }); - - LOG_VERBOSE("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots}, - {"slots", slots_data} - }); + SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); server_task_result res; res.id = task.id; @@ -1718,7 +1652,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1759,7 +1693,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1807,7 +1741,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1829,7 +1763,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SET_LORA: { - llama_lora_adapters_apply(ctx, lora_adapters); + llama_lora_adapters_apply(ctx, loras); server_task_result result; result.id = task.id; result.stop = true; @@ -1857,7 +1791,7 @@ struct server_context { } if (all_idle) { - LOG_INFO("all slots are idle", {}); + SRV_INF("%s", "all slots are idle\n"); if (system_prompt.empty() && clean_kv_cache) { kv_cache_clear(); } @@ -1867,7 +1801,7 @@ struct server_context { } { - LOG_VERBOSE("posting NEXT_RESPONSE", {}); + SRV_DBG("%s", "posting NEXT_RESPONSE\n"); server_task task; task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; @@ -1886,17 +1820,7 @@ struct server_context { const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = slot.params.n_discard ? 
slot.params.n_discard : (n_left / 2); - LOG_INFO("slot context shift", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_keep", n_keep}, - {"n_left", n_left}, - {"n_discard", n_discard}, - {"n_ctx", n_ctx}, - {"n_past", slot.n_past}, - {"n_system_tokens", system_tokens.size()}, - {"n_cache_tokens", slot.cache_tokens.size()} - }); + SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1939,15 +1863,8 @@ struct server_context { slot.cache_tokens.push_back(slot.sampled); } - LOG_VERBOSE("slot decode token", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", n_ctx}, - {"n_past", slot.n_past}, - {"n_system_tokens", system_tokens.size()}, - {"n_cache_tokens", slot.cache_tokens.size()}, - {"truncated", slot.truncated} - }); + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated); } // process in chunks of params.n_batch @@ -1968,10 +1885,7 @@ struct server_context { // we haven't tokenized the prompt yet - do it now: if (prompt_tokens.empty()) { - LOG_VERBOSE("tokenizing prompt", { - {"id_slot", slot.id}, - {"id_task", slot.id_task} - }); + SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size()); slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; @@ -2015,21 +1929,11 @@ struct server_context { slot.n_past = 0; slot.n_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("prompt tokenized", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); + SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { - LOG_INFO("empty prompt - releasing slot", { - {"id_slot", slot.id}, - {"id_task", slot.id_task} - }); + SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); slot.release(); slot.print_timings(); @@ -2071,15 +1975,7 @@ struct server_context { slot.truncated = true; slot.n_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("input truncated", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); + SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } @@ -2104,10 +2000,7 @@ struct server_context { if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. 
- LOG_INFO("we have to evaluate at least 1 token to generate logits", { - { "id_slot", slot.id }, - { "id_task", slot.id_task } - }); + SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); slot.n_past--; if (slot.ga_i > 0) { @@ -2156,11 +2049,7 @@ struct server_context { // remove the non-common part from the cache slot.cache_tokens.resize(slot.n_past); - LOG_INFO("kv cache rm [p0, end)", { - { "id_slot", slot.id }, - { "id_task", slot.id_task }, - { "p0", p0 } - }); + SLT_INF(slot, "kv cache rm [%d, end)\n", p0); int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; @@ -2189,13 +2078,7 @@ struct server_context { slot_npast++; } - LOG_VERBOSE("prompt processing progress", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens}, - }); + SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { @@ -2209,12 +2092,7 @@ struct server_context { slot.n_decoded = 0; slot.i_batch = batch.n_tokens - 1; - LOG_VERBOSE("prompt done", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - }); + SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens); } } @@ -2225,13 +2103,11 @@ struct server_context { } if (batch.n_tokens == 0) { - LOG_VERBOSE("no tokens to decode", {}); + SRV_WRN("%s", "no tokens to decode\n"); return; } - LOG_VERBOSE("decoding batch", { - {"n_tokens", batch.n_tokens}, - }); + SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); // make sure we're in the right embedding mode llama_set_embeddings(ctx, batch_type == 1); @@ -2249,10 +2125,9 @@ struct server_context { const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); @@ -2262,7 +2137,7 @@ struct server_context { slot.ga_i += slot.ga_w / slot.ga_n; - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", 
slot.n_past_se + bd, slot.n_past_se, slot.ga_i); + SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); } slot.n_past_se += n_tokens; @@ -2286,11 +2161,7 @@ struct server_context { if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", { - {"i", i}, - {"n_batch", n_batch}, - {"ret", ret}, - }); + SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); for (auto & slot : slots) { slot.release(); send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); @@ -2302,11 +2173,7 @@ struct server_context { n_batch /= 2; i -= n_batch; - LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", { - {"i", i}, - {"n_batch", n_batch}, - {"ret", ret}, - }); + SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); continue; // continue loop of n_batch } @@ -2366,7 +2233,7 @@ struct server_context { } } - LOG_VERBOSE("run slots completed", {}); + SRV_DBG("%s", "run slots completed\n"); } json model_meta() const { @@ -2387,19 +2254,18 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp return; } - LOG_INFO("request", { - {"remote_addr", req.remote_addr}, - {"remote_port", req.remote_port}, - {"status", res.status}, - {"method", req.method}, - {"path", req.path}, - {"params", req.params}, - }); + //LOG_INFO("request", { + // {"remote_addr", req.remote_addr}, + // {"remote_port", req.remote_port}, + // {"status", res.status}, + // {"method", req.method}, + // {"path", req.path}, + // {"params", req.params}, + //}); + LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - LOG_VERBOSE("request", { - {"request", req.body}, - {"response", res.body}, - }); + LOG_DBG("request: %s\n", req.body.c_str()); + LOG_DBG("response: %s\n", res.body.c_str()); } std::function shutdown_handler; @@ -2417,20 +2283,18 @@ inline void signal_handler(int signal) { } int main(int argc, char ** argv) { -#if SERVER_VERBOSE != 1 - log_disable(); -#endif // own arguments required by this example gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } - // TODO: not great to use extern vars - server_log_json = params.log_json; - server_verbose = params.verbosity > 0; + gpt_init(); + + // enabling this will output extra debug information in the HTTP responses from the server + // see format_final_response_oaicompat() + const bool verbose = params.verbosity > 9; // struct that contains llama context and inference server_context ctx_server; @@ -2446,17 +2310,10 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - LOG_INFO("build info", { - {"build", LLAMA_BUILD_NUMBER}, - {"commit", LLAMA_COMMIT} - }); - - LOG_INFO("system info", { - {"n_threads", params.cpuparams.n_threads}, - {"n_threads_batch", params.cpuparams_batch.n_threads}, - 
{"total_threads", std::thread::hardware_concurrency()}, - {"system_info", llama_print_system_info()}, - }); + LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT @@ -2488,13 +2345,13 @@ int main(int argc, char ** argv) { svr->set_logger(log_server_request); - auto res_error = [](httplib::Response & res, json error_data) { + auto res_error = [](httplib::Response & res, const json & error_data) { json final_response {{"error", error_data}}; res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); res.status = json_value(error_data, "code", 500); }; - auto res_ok = [](httplib::Response & res, json data) { + auto res_ok = [](httplib::Response & res, const json & data) { res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); res.status = 200; }; @@ -2502,7 +2359,7 @@ int main(int argc, char ** argv) { svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { std::string message; try { - std::rethrow_exception(std::move(ep)); + std::rethrow_exception(ep); } catch (std::exception & e) { message = e.what(); } catch (...) { @@ -2510,7 +2367,7 @@ int main(int argc, char ** argv) { } json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_VERBOSE("Got exception", formatted_error); + LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); res_error(res, formatted_error); }); @@ -2585,15 +2442,21 @@ int main(int argc, char ** argv) { // API key is invalid or not provided res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - LOG_WARNING("Unauthorized: Invalid API Key", {}); + LOG_WRN("Unauthorized: Invalid API Key\n"); return false; }; - auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) { + auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) { server_state current_state = state.load(); if (current_state == SERVER_STATE_LOADING_MODEL) { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); + auto tmp = string_split(req.path, '.'); + if (req.path == "/" || tmp.back() == "html") { + res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); + res.status = 503; + } else { + res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); + } return false; } return true; @@ -2916,14 +2779,14 @@ int main(int argc, char ** argv) { } res_ok(res, arr); } - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); }); } else { const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { return server_sent_event(sink, "data", result.data); - }, [&](json error_data) { + }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); sink.done(); @@ -2944,7 +2807,7 @@ int main(int argc, char ** argv) { }; // TODO: maybe merge this function with "handle_completions_generic" - const 
auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2961,16 +2824,16 @@ int main(int argc, char ** argv) { const auto completion_id = gen_chatcmplid(); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { + ctx_server.receive_cmpl_results(task_ids, [&](const std::vector & results) { // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id); + json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose); res_ok(res, result_oai); - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); }); } else { const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); for (auto & event_data : result_array) { if (event_data.empty()) { @@ -2981,9 +2844,11 @@ int main(int argc, char ** argv) { } } return true; // ok - }, [&](json error_data) { + }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); + static const std::string ev_done = "data: [DONE]\n\n"; + sink.write(ev_done.data(), ev_done.size()); sink.done(); return true; }; @@ -3011,12 +2876,39 @@ int main(int argc, char ** argv) { const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) { const json body = json::parse(req.body); - std::vector tokens; + json tokens_response = json::array(); if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); - tokens = ctx_server.tokenize(body.at("content"), add_special); + const bool with_pieces = json_value(body, "with_pieces", false); + std::vector tokens = ctx_server.tokenize(body.at("content"), add_special); + + if (with_pieces) { + for (const auto& token : tokens) { + std::string piece = llama_token_to_piece(ctx_server.ctx, token); + json piece_json; + + // Check if the piece is valid UTF-8 + if (is_valid_utf8(piece)) { + piece_json = piece; + } else { + // If not valid UTF-8, store as array of byte values + piece_json = json::array(); + for (unsigned char c : piece) { + piece_json.push_back(static_cast(c)); + } + } + + tokens_response.push_back({ + {"id", token}, + {"piece", piece_json} + }); + } + } else { + tokens_response = tokens; + } } - const json data = format_tokenizer_response(tokens); + + const json data = format_tokenizer_response(tokens_response); res_ok(res, data); }; @@ -3065,7 +2957,7 @@ int main(int argc, char ** argv) { for (const auto & res : results) { responses.push_back(res.data); } - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); error = true; }); @@ -3084,12 +2976,12 @@ int main(int argc, char ** argv) { const auto handle_lora_adapters_list = [&](const 
httplib::Request &, httplib::Response & res) { json result = json::array(); - for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) { - auto & la = ctx_server.lora_adapters[i]; + for (size_t i = 0; i < ctx_server.loras.size(); ++i) { + auto & lora = ctx_server.loras[i]; result.push_back({ {"id", i}, - {"path", la.path}, - {"scale", la.scale}, + {"path", lora.path}, + {"scale", lora.scale}, }); } res_ok(res, result); @@ -3098,11 +2990,11 @@ int main(int argc, char ** argv) { const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { const std::vector body = json::parse(req.body); - int max_idx = ctx_server.lora_adapters.size(); + int max_idx = ctx_server.loras.size(); // clear existing value - for (auto & la : ctx_server.lora_adapters) { - la.scale = 0.0f; + for (auto & lora : ctx_server.loras) { + lora.scale = 0.0f; } // set value @@ -3110,7 +3002,7 @@ int main(int argc, char ** argv) { int id = entry.at("id"); float scale = entry.at("scale"); if (0 <= id && id < max_idx) { - ctx_server.lora_adapters[id].scale = scale; + ctx_server.loras[id].scale = scale; } else { throw std::runtime_error("invalid adapter id"); } @@ -3205,59 +3097,59 @@ int main(int argc, char ** argv) { // bind HTTP listen port, run the HTTP server in a thread if (!svr->bind_to_port(params.hostname, params.port)) { - LOG_ERROR("couldn't bind HTTP server socket", { - {"hostname", params.hostname}, - {"port", params.port}, - }); + //LOG_ERROR("couldn't bind HTTP server socket", { + // {"hostname", params.hostname}, + // {"port", params.port}, + //}); + LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port); clean_up(); - LOG_ERROR("exiting due to HTTP server error", {}); return 1; } std::thread t([&]() { svr->listen_after_bind(); }); svr->wait_until_ready(); - LOG_INFO("HTTP server is listening", log_data); + //LOG_INFO("HTTP server is listening", log_data); + LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); // load the model - LOG_INFO("loading model", log_data); + LOG_INF("%s: loading model\n", __func__); + if (!ctx_server.load_model(params)) { clean_up(); t.join(); - LOG_ERROR("exiting due to model loading error", {}); + LOG_ERR("%s: exiting due to model loading error\n", __func__); return 1; - } else { - ctx_server.init(); - state.store(SERVER_STATE_READY); - - LOG_INFO("model loaded", {}); - - // if a custom chat template is not supplied, we will use the one that comes with the model (if any) - if (params.chat_template.empty()) { - if (!ctx_server.validate_model_chat_template()) { - LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. 
This may cause the model to output suboptimal responses", {}); - params.chat_template = "chatml"; - } - } - - // print sample chat example to make it clear which template is used - { - LOG_INFO("chat template", { - {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, - {"built_in", params.chat_template.empty()}, - }); - } - - ctx_server.queue_tasks.on_new_task(std::bind( - &server_context::process_single_task, &ctx_server, std::placeholders::_1)); - ctx_server.queue_tasks.on_update_slots(std::bind( - &server_context::update_slots, &ctx_server)); - - shutdown_handler = [&](int) { - ctx_server.queue_tasks.terminate(); - }; - ctx_server.queue_tasks.start_loop(); } + ctx_server.init(); + state.store(SERVER_STATE_READY); + + LOG_INF("%s: model loaded\n", __func__); + + // if a custom chat template is not supplied, we will use the one that comes with the model (if any) + if (params.chat_template.empty()) { + if (!ctx_server.validate_model_chat_template()) { + LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); + params.chat_template = "chatml"; + } + } + + // print sample chat example to make it clear which template is used + LOG_INF("%s: chat template, built_in: %d, chat_example: '%s\n'", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str()); + + ctx_server.queue_tasks.on_new_task(std::bind( + &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( + &server_context::update_slots, &ctx_server)); + + shutdown_handler = [&](int) { + ctx_server.queue_tasks.terminate(); + }; + + LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); + + ctx_server.queue_tasks.start_loop(); + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; diff --git a/examples/server/tests/.gitignore b/examples/server/tests/.gitignore new file mode 100644 index 000000000..1d17dae13 --- /dev/null +++ b/examples/server/tests/.gitignore @@ -0,0 +1 @@ +.venv diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 5e6cb277b..10f22c447 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -40,7 +40,6 @@ It's possible to override some scenario steps values with environment variables: | `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` | | `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` | | `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` | -| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format | | `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` | ### Run @bug, @wip or @wrong_usage annotated scenario diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index b55971454..15e24c624 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -105,6 +105,14 @@ Feature: llama.cpp server Given first token is removed Then tokens can be detokenized + Scenario: Tokenize with pieces + When tokenizing with pieces: + """ + What is the capital of Germany? 
+ 媽 + """ + Then tokens are given with pieces + Scenario: Models available Given available models Then 1 models are supported diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 65b71a8e8..062f084be 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + import asyncio import json import os @@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context): context.tokenize_add_special = True +@step("tokenizing with pieces") +@async_run_until_complete +async def step_tokenize_with_pieces(context): + context.tokenized_text = context_text(context) + async with aiohttp.ClientSession() as session: + tokenize_args = {"content": context.tokenized_text, "with_pieces": True} + if getattr(context, "tokenize_add_special", None) is not None: + tokenize_args["add_special"] = context.tokenize_add_special + + async with session.post( + f"{context.base_url}/tokenize", json=tokenize_args + ) as response: + assert response.status == 200 + tokenize_json = await response.json() + context.tokens_with_pieces = tokenize_json["tokens"] + + +@step("tokens are given with pieces") +@async_run_until_complete +async def step_tokenize_with_pieces(context): + # Verify that the response contains both token IDs and pieces + assert all( + "id" in token and "piece" in token for token in context.tokens_with_pieces + ) + + @step('tokenizing') @async_run_until_complete async def step_tokenize(context): @@ -991,6 +1020,8 @@ async def oai_chat_completions(user_prompt, event_data = line.split(': ', 1) assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```' chunk_raw = event_data[1] + if chunk_raw == '[DONE]': + break chunk = json.loads(chunk_raw) assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```" @@ -1341,8 +1372,6 @@ def start_server_background(context): server_args.append('--verbose') if context.lora_file: server_args.extend(['--lora', context.lora_file]) - if 'SERVER_LOG_FORMAT_JSON' not in os.environ: - server_args.extend(['--log-format', "text"]) args = [str(arg) for arg in [context.server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index edfce65b6..537c8a223 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -1,7 +1,8 @@ #pragma once -#include "llama.h" #include "common.h" +#include "log.h" +#include "llama.h" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -15,10 +16,10 @@ #define JSON_ASSERT GGML_ASSERT #include "json.hpp" +#include +#include #include #include -#include -#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" @@ -35,32 +36,6 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; -extern bool server_verbose; -extern bool server_log_json; - -#ifndef SERVER_VERBOSE -#define SERVER_VERBOSE 1 -#endif - -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do \ - { \ - if (server_verbose) \ - { \ - server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) 
server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra); - template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value @@ -68,9 +43,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul try { return body.at(key); } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) { - std::stringstream ss; - ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value."; - LOG_WARNING(ss.str().c_str(), body); + LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name()); return default_value; } } else { @@ -78,48 +51,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul } } -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) { - std::stringstream ss_tid; - ss_tid << std::this_thread::get_id(); - json log = json{ - {"tid", ss_tid.str()}, - {"timestamp", time(nullptr)}, - }; - - if (server_log_json) { - log.merge_patch({ - {"level", level}, - {"function", function}, - {"line", line}, - {"msg", message}, - }); - - if (!extra.empty()) { - log.merge_patch(extra); - } - - printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); - } else { - char buf[1024]; - snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - - if (!extra.empty()) { - log.merge_patch(extra); - } - std::stringstream ss; - ss << buf << " |"; - for (const auto & el : log.items()) - { - const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); - ss << " " << el.key() << "=" << value; - } - - const std::string str = ss.str(); - printf("%.*s\n", (int)str.size(), str.data()); - } - fflush(stdout); -} - // // chat template utils // @@ -153,8 +84,9 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat.push_back({role, content}); } - auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); - LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); + const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); + LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); + return formatted_chat; } @@ -243,10 +175,7 @@ static std::string random_string() { } static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - - return chatcmplid.str(); + return "chatcmpl-" + random_string(); } // @@ -287,7 +216,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin return std::string::npos; } -static bool json_is_array_of_numbers(json data) { +static bool json_is_array_of_numbers(const json & data) { if (data.is_array()) { for (const auto & e : data) { if (!e.is_number()) { @@ -363,15 +292,13 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector unsupported_params { "tools", "tool_choice" }; - for (auto & param : unsupported_params) { + for (const auto & param : unsupported_params) { if (body.contains(param)) { throw std::runtime_error("Unsupported param: " + param); } @@ -444,7 +371,7 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & 
request, json result, const std::string & completion_id, bool streaming = false) { +static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) { bool stopped_word = result.count("stopped_word") != 0; bool stopped_eos = json_value(result, "stopped_eos", false); int num_tokens_predicted = json_value(result, "tokens_predicted", 0); @@ -481,7 +408,8 @@ static json format_final_response_oaicompat(const json & request, json result, c {"id", completion_id} }; - if (server_verbose) { + // extra fields for debugging purposes + if (verbose) { res["__verbose"] = result; } @@ -493,7 +421,7 @@ static json format_final_response_oaicompat(const json & request, json result, c } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(json result, const std::string & completion_id) { +static std::vector format_partial_response_oaicompat(const json & result, const std::string & completion_id) { if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { return std::vector({result}); } @@ -595,7 +523,7 @@ static std::vector format_partial_response_oaicompat(json result, const st static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { json data = json::array(); int i = 0; - for (auto & elem : embeddings) { + for (const auto & elem : embeddings) { data.push_back(json{ {"embedding", json_value(elem, "embedding", json::array())}, {"index", i++}, @@ -616,7 +544,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso return res; } -static json format_tokenizer_response(const std::vector & tokens) { +static bool is_valid_utf8(const std::string & str) { + const unsigned char* bytes = reinterpret_cast(str.data()); + const unsigned char* end = bytes + str.length(); + + while (bytes < end) { + if (*bytes <= 0x7F) { + // 1-byte sequence (0xxxxxxx) + bytes++; + } else if ((*bytes & 0xE0) == 0xC0) { + // 2-byte sequence (110xxxxx 10xxxxxx) + if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80) + return false; + bytes += 2; + } else if ((*bytes & 0xF0) == 0xE0) { + // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx) + if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80) + return false; + bytes += 3; + } else if ((*bytes & 0xF8) == 0xF0) { + // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 || + (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80) + return false; + bytes += 4; + } else { + // Invalid UTF-8 lead byte + return false; + } + } + + return true; +} + +static json format_tokenizer_response(const json & tokens) { return json { {"tokens", tokens} }; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index a53cef547..c2b7267c8 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,15 +1,14 @@ +#include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" -#include -#include -#include #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -18,11 +17,12 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; 
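The `is_valid_utf8()` helper and the now json-typed `format_tokenizer_response()` introduced in utils.hpp above exist so that the server's `/tokenize` endpoint can return per-token pieces: a piece that is valid UTF-8 is emitted as a JSON string, anything else as an array of raw byte values. Below is a minimal sketch of that conversion; `piece_to_json()` is a hypothetical illustration of the handler logic shown earlier, not a function added by this diff, and it is written as if it sat in utils.hpp next to `is_valid_utf8()` where `json` and `llama_token` are already available.

```cpp
// Hypothetical helper, for illustration only: if placed in utils.hpp next to
// is_valid_utf8(), it would build one {"id", "piece"} entry the same way the
// /tokenize handler does when "with_pieces" is requested.
static json piece_to_json(llama_token token_id, const std::string & piece) {
    json piece_json;
    if (is_valid_utf8(piece)) {
        piece_json = piece;          // valid UTF-8: store as a plain JSON string
    } else {
        piece_json = json::array();  // otherwise fall back to raw byte values
        for (unsigned char c : piece) {
            piece_json.push_back(static_cast<int>(c));
        }
    }
    return json {
        {"id",    token_id},
        {"piece", piece_json},
    };
}
```

A request body such as `{"content": "What is the capital of Germany?", "with_pieces": true}` then yields `{"tokens": [{"id": ..., "piece": ...}, ...]}`, which is the shape the new behave step checks for.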
params.n_predict = 32; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } + gpt_init(); + // total length of the sequence including the prompt const int n_predict = params.n_predict; @@ -69,25 +69,24 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); - LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); + LOG("\n"); + LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__); + LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); + LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__); return 1; } // print the prompt token-by-token - fprintf(stderr, "\n"); + LOG("\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } - fflush(stderr); - // create a llama_batch with size 512 // we use this object to submit token data for decoding @@ -102,7 +101,7 @@ int main(int argc, char ** argv) { batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG("%s: llama_decode() failed\n", __func__); return 1; } @@ -116,18 +115,16 @@ int main(int argc, char ** argv) { while (n_cur <= n_predict) { // sample the next token { - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); - - llama_sampler_accept(smpl, new_token_id); + const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1); // is it an end of generation? 
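As an aside, the same start-up boilerplate recurs across the examples touched by this diff: `gpt_params_parse()` now takes the example identifier (plus an optional usage callback) directly, and `gpt_init()` configures the common logger behind `LOG`/`LOG_INF`/`LOG_ERR`. A stripped-down sketch of that pattern is shown here, using only calls that appear elsewhere in this diff; the model loading and generation parts are elided.

```cpp
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;

    // parse command-line arguments for a generic example;
    // returns false on error or after printing usage
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    gpt_init(); // set up the common logger used by LOG_INF/LOG_WRN/LOG_ERR

    llama_backend_init();
    llama_numa_init(params.numa);

    LOG_INF("%s: backend initialized, n_threads = %d\n", __func__, params.cpuparams.n_threads);

    // ... model loading and generation would go here, as in the example above ...

    llama_backend_free();
    return 0;
}
```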
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { - LOG_TEE("\n"); + LOG("\n"); break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); // prepare the next batch @@ -143,23 +140,23 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); + LOG("\n"); const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + LOG("\n"); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); - fprintf(stderr, "\n"); + LOG("\n"); llama_batch_free(batch); llama_sampler_free(smpl); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 8f29b5a2c..fbac21811 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -1,11 +1,16 @@ +#include "arg.h" #include "common.h" +#include "sampling.h" +#include "log.h" #include "llama.h" -#include +#include #include +#include +#include +#include #include #include -#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 @@ -27,13 +32,14 @@ struct seq_draft { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { return 1; } + gpt_init(); + if (params.model_draft.empty()) { - fprintf(stderr, "%s: error: --model-draft is required\n", __func__); + LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } @@ -46,12 +52,6 @@ int main(int argc, char ** argv) { std::default_random_engine rng(params.sparams.seed); std::uniform_real_distribution<> u_dist; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("speculative", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -80,14 +80,14 @@ int main(int argc, char ** argv) { ctx_dft = llama_init_dft.context; const bool vocab_type_tgt = llama_vocab_type(model_tgt); - LOG("vocab_type tgt: %d\n", vocab_type_tgt); + LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt); const bool vocab_type_dft = llama_vocab_type(model_dft); - LOG("vocab_type dft: %d\n", vocab_type_dft); + LOG_DBG("vocab_type dft: %d\n", vocab_type_dft); if (vocab_type_tgt != vocab_type_dft) { - fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__); - fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); + LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__); + LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); return 1; } @@ -97,7 +97,7 @@ int main(int argc, char ** argv) { llama_token_bos(model_tgt) != llama_token_bos(model_dft) || 
llama_token_eos(model_tgt) != llama_token_eos(model_dft) ) { - fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__); + LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); return 1; } @@ -109,8 +109,8 @@ int main(int argc, char ** argv) { : n_vocab_dft - n_vocab_tgt; if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { - fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__); - fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__); + LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return 1; } @@ -119,8 +119,8 @@ int main(int argc, char ** argv) { const char * token_text_tgt = llama_token_get_text(model_tgt, i); const char * token_text_dft = llama_token_get_text(model_dft, i); if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); - fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, + LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__); + LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i, llama_token_to_piece(ctx_tgt, i).c_str(), llama_token_to_piece(ctx_dft, i).c_str()); return 1; @@ -137,18 +137,16 @@ int main(int argc, char ** argv) { const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str()); + LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str()); } - fflush(stderr); - const int n_input = inp.size(); const auto t_enc_start = ggml_time_us(); @@ -210,7 +208,7 @@ int main(int argc, char ** argv) { active_seqs.insert(s); const auto & tokens = drafts[s].tokens; - LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); + LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str()); } int i_dft = 0; @@ -253,7 +251,7 @@ int main(int argc, char ** argv) { continue; } - LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); + LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); float r = u_dist(rng); llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true }; @@ -271,7 +269,7 @@ int main(int argc, char ** argv) { break; } } - LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt); + LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt); if (r <= p_tgt / p_dft) { s_keep = s; accept = true; @@ -279,10 +277,10 @@ int main(int argc, char ** argv) { token_str = llama_token_to_piece(ctx_tgt, token_id); gpt_sampler_accept(smpl, token_id, true); - LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); + LOG_DBG("draft 
token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); break; } else { - LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); + LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); drafts[s].active = false; // calculate residual probability @@ -337,7 +335,7 @@ int main(int argc, char ** argv) { if (!accept) { // all drafted tokens were rejected // sample from the target model - LOG("all drafted tokens were rejected, sampling from residual distribution\n"); + LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n"); std::vector probs(dist_tgt.size); for (size_t i = 0; i < dist_tgt.size; ++i) { probs[i] = dist_tgt.data[i].p; @@ -355,13 +353,11 @@ int main(int argc, char ** argv) { // greedy verification // sample from the target model - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); + LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); gpt_sampler_accept(smpl, token_id, true); - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str()); - token_str = llama_token_to_piece(ctx_tgt, token_id); for (int s = 0; s < n_seq_dft; ++s) { @@ -370,7 +366,7 @@ int main(int argc, char ** argv) { } if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str()); + LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str()); s_keep = s; accept = true; @@ -392,26 +388,24 @@ int main(int argc, char ** argv) { ++i_dft; if (params.use_color) { // Color token according to its origin sequence - printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); + LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); } else { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } - fflush(stdout); continue; } else { - printf("%s", token_str.c_str()); - fflush(stdout); + LOG("%s", token_str.c_str()); break; } } } { - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str()); + LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str()); // TODO: simplify { - LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); llama_kv_cache_seq_keep(ctx_dft, s_keep); llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); @@ -438,7 +432,7 @@ int main(int argc, char ** argv) { llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -485,7 +479,7 @@ int main(int argc, char ** argv) { const auto * cur_p = 
gpt_sampler_get_candidates(drafts[s].smpl); for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) { - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", + LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); } @@ -494,7 +488,7 @@ int main(int argc, char ** argv) { // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) { - LOG("splitting seq %3d into %3d\n", s, n_seq_cur); + LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); @@ -583,7 +577,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); } - // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); + // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); llama_decode(ctx_tgt, batch_tgt); ++n_past_tgt; } @@ -601,23 +595,25 @@ int main(int argc, char ** argv) { auto t_dec_end = ggml_time_us(); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ndraft:\n\n"); + LOG_INF("\n"); + LOG_INF("draft:\n\n"); // TODO: print sampling/grammar timings for all drafts - llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_dft); - LOG_TEE("\ntarget:\n\n"); + LOG_INF("\n"); + LOG_INF("target:\n\n"); gpt_perf_print(ctx_tgt, smpl); gpt_sampler_free(smpl); @@ -636,7 +632,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index 111366fb0..a8cf0aa64 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -4,33 +4,23 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: MIT -INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" source /opt/intel/oneapi/setvars.sh -if [ $# -gt 0 ]; then - GGML_SYCL_DEVICE=$1 - GGML_SYCL_SINGLE_GPU=1 -else - GGML_SYCL_DEVICE=0 - GGML_SYCL_SINGLE_GPU=0 -fi - #export GGML_SYCL_DEBUG=1 - #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. 
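For reference, the accept/reject logic converted to LOG_DBG above (`r <= p_tgt / p_dft`, followed by "sampling from residual distribution" when every draft is rejected) is the standard speculative-sampling rule: a drafted token is kept with probability min(1, p_tgt/p_dft), and on rejection the replacement token is drawn from the clamped, renormalized difference of the two distributions, which preserves the target model's output distribution. A small illustrative sketch, not code from this diff:

```cpp
// Illustrative sketch of the speculative-sampling acceptance rule used in
// examples/speculative above; not part of the diff itself.
#include <algorithm>

// keep the drafted token when a uniform draw r in [0, 1) satisfies
// r <= p_tgt / p_dft (i.e. accept with probability min(1, p_tgt / p_dft))
static bool accept_draft_token(float p_tgt, float p_dft, float r) {
    return r <= std::min(1.0f, p_tgt / p_dft);
}

// on rejection, each vocabulary entry gets weight max(0, p_tgt - p_dft);
// renormalizing these weights gives the residual distribution to sample from
static float residual_weight(float p_tgt, float p_dft) {
    return std::max(0.0f, p_tgt - p_dft);
}
```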
-if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then +INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:" +MODEL_FILE=llama-2-7b.Q4_0.gguf +NGL=33 + +if [ $# -gt 0 ]; then + GGML_SYCL_DEVICE=$1 echo "use $GGML_SYCL_DEVICE as main GPU" #use single GPU only - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none + else #use multiple GPUs with same max compute units - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 fi - -#use main GPU only -#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none - -#use multiple GPUs with same max compute units -#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index c817be566..a9af6471f 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -1,11 +1,13 @@ #include "common.h" +//#include "log.h" // TODO: start using log.h #include "llama.h" -#include #include +#include #include #include #include +#include // TODO: remove me #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -13,25 +15,25 @@ #include // For CommandLineToArgvW #endif -static void print_usage_information(const char * argv0, FILE * stream) { - fprintf(stream, "usage: %s [options]\n\n", argv0); - fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n"); - fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); - fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); - fprintf(stream, "to control the behavior of the tokenizer.\n\n"); - fprintf(stream, " The possible options are:\n"); - fprintf(stream, "\n"); - fprintf(stream, " -h, --help print this help and exit\n"); - fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); - fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n"); - fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); - fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); - fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); - fprintf(stream, " --stdin read prompt from standard input.\n"); - fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); - fprintf(stream, " --no-parse-special do not parse control tokens.\n"); - fprintf(stream, " --log-disable disable logs.
Makes stderr quiet when loading the model.\n"); - fprintf(stream, " --show-count print the total number of tokens.\n"); +static void print_usage_information(const char * argv0) { + printf("usage: %s [options]\n\n", argv0); + printf("The tokenize program tokenizes a prompt using a given model,\n"); + printf("and prints the resulting tokens to standard output.\n\n"); + printf("It needs a model file, a prompt, and optionally other flags\n"); + printf("to control the behavior of the tokenizer.\n\n"); + printf(" The possible options are:\n"); + printf("\n"); + printf(" -h, --help print this help and exit\n"); + printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n"); + printf(" --ids if given, only print numerical token IDs, and not token strings.\n"); + printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); + printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); + printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); + printf(" --stdin read prompt from standard input.\n"); + printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); + printf(" --no-parse-special do not parse control tokens.\n"); + printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n"); + printf(" --show-count print the total number of tokens.\n"); } static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { @@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) { const int argc = argv.size(); if (argc <= 1) { - print_usage_information(argv[0].c_str(), stderr); + print_usage_information(argv[0].c_str()); return 1; } @@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) { for (; iarg < argc; ++iarg) { std::string arg{argv[iarg]}; if (arg == "-h" || arg == "--help") { - print_usage_information(argv[0].c_str(), stdout); + print_usage_information(argv[0].c_str()); return 0; } else if (arg == "--ids") { @@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) { // Start actually doing the tokenizing stuff. 
////// -#ifdef LOG_DISABLE_LOGS - disable_logging = true; -#endif - if (disable_logging) { llama_log_set(llama_log_callback_null, NULL); } diff --git a/flake.lock b/flake.lock index 10e1f8a29..0db5ff92a 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1725024810, - "narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=", + "lastModified": 1726153070, + "narHash": "sha256-HO4zgY0ekfwO5bX0QH/3kJ/h4KvUDFZg8YpkNwIbg1U=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "af510d4a62d071ea13925ce41c95e3dec816c01d", + "rev": "bcef6817a8b2aa20a5a6dbb19b43e63c5bf8619a", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1724819573, - "narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=", + "lastModified": 1726062873, + "narHash": "sha256-IiA3jfbR7K/B5+9byVi9BZGWTD4VSbWe8VLpp9B/iYk=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "71e91c409d1e654808b2621f28a327acfdad8dc2", + "rev": "4f807e8940284ad7925ebd0a0993d2a1791acb2f", "type": "github" }, "original": { @@ -36,14 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1722555339, - "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=", + "lastModified": 1725233747, + "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=", "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz" }, "original": { "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz" } }, "root": { diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 532534bcb..89fdf9d1c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -56,6 +56,15 @@ else() set(GGML_NATIVE_DEFAULT ON) endif() +# defaults +if (NOT GGML_LLAMAFILE_DEFAULT) + set(GGML_LLAMAFILE_DEFAULT OFF) +endif() + +if (NOT GGML_CUDA_GRAPHS_DEFAULT) + set(GGML_CUDA_GRAPHS_DEFAULT OFF) +endif() + # general option(GGML_STATIC "ggml: static link libraries" OFF) option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT}) @@ -110,7 +119,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework" option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING "ggml: BLAS library vendor") -option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF) +option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_MUSA "ggml: use MUSA" OFF) @@ -127,7 +136,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) -option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF) +option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT}) option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h index ca73211fe..031ad1ce2 100644 --- a/ggml/include/ggml-cann.h +++ b/ggml/include/ggml-cann.h @@ -80,6 +80,13 @@ 
ggml_backend_cann_buffer_type(int32_t device); */ GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); +/** + * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. + * + * @return A pointer to the host buffer type interface. + */ +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); + /** * @brief Retrieves the description of a specific CANN device. * diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 536018b66..a413df357 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -358,6 +358,7 @@ extern "C" { struct ggml_object; struct ggml_context; + struct ggml_cgraph; // NOTE: always add types at the end of the enum to keep backward compatibility enum ggml_type { @@ -563,10 +564,11 @@ extern "C" { }; enum ggml_log_level { - GGML_LOG_LEVEL_ERROR = 2, - GGML_LOG_LEVEL_WARN = 3, - GGML_LOG_LEVEL_INFO = 4, - GGML_LOG_LEVEL_DEBUG = 5 + GGML_LOG_LEVEL_NONE = 0, + GGML_LOG_LEVEL_INFO = 1, + GGML_LOG_LEVEL_WARN = 2, + GGML_LOG_LEVEL_ERROR = 3, + GGML_LOG_LEVEL_DEBUG = 4, }; enum ggml_tensor_flag { @@ -575,23 +577,9 @@ extern "C" { GGML_TENSOR_FLAG_PARAM = 4, }; - // ggml object - struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - enum ggml_object_type type; - - char padding[4]; - }; - - static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - // n-dimensional tensor struct ggml_tensor { - enum ggml_type type; + enum ggml_type type; GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); @@ -655,7 +643,7 @@ extern "C" { struct ggml_threadpool; // forward declaration, see ggml.c - typedef struct ggml_threadpool * ggml_threadpool_t; + typedef struct ggml_threadpool * ggml_threadpool_t; // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 @@ -671,35 +659,6 @@ extern "C" { void * abort_callback_data; }; - enum ggml_cgraph_eval_order { - GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, - GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, - GGML_CGRAPH_EVAL_ORDER_COUNT - }; - - typedef uint32_t ggml_bitset_t; - - struct ggml_hash_set { - size_t size; - ggml_bitset_t * used; // whether or not the keys are in use i.e. 
set - struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) - }; - - // computation graph - struct ggml_cgraph { - int size; - int n_nodes; - int n_leafs; - - struct ggml_tensor ** nodes; - struct ggml_tensor ** grads; - struct ggml_tensor ** leafs; - - struct ggml_hash_set visited_hash_set; - - enum ggml_cgraph_eval_order order; - }; - // scratch buffer struct ggml_scratch { size_t offs; @@ -2017,8 +1976,6 @@ extern "C" { typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); - #define GGML_N_TASKS_MAX -1 - GGML_API struct ggml_tensor * ggml_map_custom1( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2088,30 +2045,35 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); // graph allocation in a context - GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false - GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads); - GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1); - GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads - GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false + GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); + GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); + GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads + GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + + GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph); + GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i] + GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph); + GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph); + + GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); - GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads); - GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); - GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_API void ggml_threadpool_free (struct ggml_threadpool * 
threadpool); - GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data @@ -2509,6 +2471,7 @@ extern "C" { GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_ssse3 (void); + GGML_API int ggml_cpu_has_riscv_v (void); GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index cd2dcd066..527c22c68 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -26,6 +26,9 @@ if (NOT MSVC) endif() endif() +unset(GGML_EXTRA_LIBS_PRIVATE) +unset(GGML_EXTRA_LIBS_PUBLIC) + if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) if (ACCELERATE_FRAMEWORK) @@ -35,7 +38,7 @@ if (APPLE AND GGML_ACCELERATE) add_compile_definitions(ACCELERATE_NEW_LAPACK) add_compile_definitions(ACCELERATE_LAPACK_ILP64) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK}) else() message(WARNING "Accelerate framework not found") endif() @@ -87,7 +90,7 @@ if (GGML_METAL) COMMENT "Generate assembly for embedded Metal library" ) - set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) + list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM}) else() if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: @@ -132,7 +135,7 @@ if (GGML_METAL) ) endif() # GGML_METAL_EMBED_LIBRARY - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} @@ -157,11 +160,11 @@ if (GGML_OPENMP) add_compile_definitions(GGML_USE_OPENMP) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) if (GGML_MUSA) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so") + list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") + list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so") endif() else() message(WARNING "OpenMP not found") @@ -244,8 +247,8 @@ if (GGML_BLAS) set(GGML_HEADERS_BLAS ../include/ggml-blas.h) set(GGML_SOURCES_BLAS ggml-blas.cpp) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${BLAS_LIBRARIES}) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE 
${BLAS_LIBRARIES}) + list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS}) else() message(WARNING "BLAS not found, please refer to " "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" @@ -326,7 +329,7 @@ if (GGML_CUDA) add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) - if (GGML_CUDA_USE_GRAPHS) + if (GGML_CUDA_GRAPHS) add_compile_definitions(GGML_CUDA_USE_GRAPHS) endif() @@ -368,19 +371,19 @@ if (GGML_CUDA) if (GGML_STATIC) if (WIN32) # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt) else () if (GGML_MUSA) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static) + list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static) else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) endif() endif() else() if (GGML_MUSA) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas) + list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas) else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() endif() @@ -388,9 +391,9 @@ if (GGML_CUDA) # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) else() if (GGML_MUSA) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... endif() endif() else() @@ -495,7 +498,7 @@ if (GGML_HIPBLAS) if (CXX_IS_HIPCC) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device) + list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device) else() set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) endif() @@ -504,7 +507,7 @@ if (GGML_HIPBLAS) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") endif() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) + list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas) endif() if (GGML_SYCL) @@ -513,7 +516,8 @@ if (GGML_SYCL) endif() check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL) - if ( DEFINED ENV{ONEAPI_ROOT}) + + if (DEFINED ENV{ONEAPI_ROOT}) message(STATUS "Using oneAPI Release SYCL compiler (icpx).") elseif(SUPPORTS_SYCL) message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}. 
@@ -551,26 +555,29 @@ if (GGML_SYCL) find_package(DNNL) message("-- DNNL found:" ${DNNL_FOUND}) + if (GGML_SYCL_TARGET STREQUAL "INTEL") add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND}) else() add_compile_definitions(GGML_SYCL_DNNL=0) endif() + + if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") + list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl) + endif() + if (WIN32) find_package(IntelSYCL REQUIRED) find_package(MKL REQUIRED) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() if (GGML_SYCL_TARGET STREQUAL "INTEL") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl) + list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl) endif() endif() - if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") - list(APPEND GGML_EXTRA_LIBS DNNL::dnnl) - endif() endif() if (GGML_RPC) @@ -579,7 +586,7 @@ if (GGML_RPC) list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC) if (WIN32) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32) endif() set(GGML_HEADERS_RPC ../include/ggml-rpc.h) @@ -657,8 +664,8 @@ if (GGML_VULKAN) set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header}) set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source}) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Vulkan not found") endif() @@ -817,8 +824,8 @@ if (GGML_KOMPUTE) list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} kompute) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Kompute not found") endif() @@ -883,9 +890,10 @@ if (GGML_CANN) message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} ) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS}) - set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} ) + list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64) + list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) endif() else() @@ -1322,21 +1330,25 @@ if (EMSCRIPTEN) set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128") endif() -target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) -target_include_directories(ggml PUBLIC ../include) +target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) +target_include_directories(ggml PUBLIC ../include) target_include_directories(ggml PRIVATE . 
${GGML_EXTRA_INCLUDES}) -target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) +target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) target_compile_features (ggml PRIVATE c_std_11) # don't bump -target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) +list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads) find_library(MATH_LIBRARY m) if (MATH_LIBRARY) if (NOT WIN32 OR NOT GGML_SYCL) - target_link_libraries(ggml PRIVATE ${MATH_LIBRARY}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE m) endif() endif() +list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE) +list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC) +target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC}) + if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 72cb83c9b..27375d0d7 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -4,6 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include #include diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 713731735..6d99c6bea 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -1,3 +1,4 @@ +#include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 06930ba2e..aa315b83f 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -30,6 +30,7 @@ #include #include +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cann/aclnn_ops.h" #include "ggml-cann/common.h" @@ -1220,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) { return &ggml_backend_cann_buffer_types[device]; } +/** + * @brief Retrieves the name associated with a CANN host buffer type. + * + * This function returns the descriptive name associated with the specified + * CANN host buffer type context. + * + * @param buft Pointer to the host buffer type context. + * @return Const pointer to the C-style string containing the name. + */ +GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "CANN_Host"; + + GGML_UNUSED(buft); +} + +/** + * @brief Retrieves the name associated with a CANN host buffer. + * + * This function returns the descriptive name associated with the specified + * CANN host buffer context. + * + * @param buft Pointer to the host buffer context. + * @return Const pointer to the C-style string containing the name. + */ +GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) { + return "CANN_Host"; + + GGML_UNUSED(buffer); +} + +/** + * @brief Free resources associated with a CANN host buffer. + * + * This function frees the resources associated with a CANN host buffer, including + * its context. + * + * @param buffer The CANN host buffer to free. + */ +GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { + ACL_CHECK(aclrtFreeHost(buffer->context)); +} + +/** + * @brief Allocates a new CANN host buffer of the specified size. + * + * This function allocates a new CANN host buffer with the given size. + * @param size Size in bytes of the host buffer to allocate. + * @return Pointer to the allocated host buffer, or nullptr if allocation fails. 
+ */ +static void * ggml_cann_host_malloc(size_t size) { + if (getenv("GGML_CANN_NO_PINNED") != nullptr) { + return nullptr; + } + + void * hostPtr = nullptr; + aclError err = aclrtMallocHost((void **) &hostPtr, size); + if (err != ACL_SUCCESS) { + + GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + size / 1024.0 / 1024.0, aclGetRecentErrMsg()); + return nullptr; + } + return hostPtr; +} + +/** + * @brief Allocates a new CANN host buffer of the specified type and size. + * + * @param buft Pointer to the host buffer type context. + * @param size Size in bytes of the host buffer to allocate. + * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails. + */ +GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * hostPtr = ggml_cann_host_malloc(size); + + if (hostPtr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cann_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free; + + return buffer; +} + +/** + * @brief Interface for managing CANN host buffer types in the GGML backend. + * + * Provides function pointers for allocating, querying properties, and managing + * memory for CANN buffer types in the GGML backend. + */ +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cann_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_cann_buffer_type_host; +} + /** * @brief Computes the forward operation for a given tensor using CANN * operations. @@ -1942,7 +2053,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) { GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return nullptr; } - + ggml_cann_set_device(ctx->device); ggml_backend_t cann_backend = new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, diff --git a/ggml/src/ggml-cpu-impl.h b/ggml/src/ggml-cpu-impl.h new file mode 100644 index 000000000..5b45155b0 --- /dev/null +++ b/ggml/src/ggml-cpu-impl.h @@ -0,0 +1,614 @@ +#pragma once + +// GGML CPU internal header + +#include "ggml.h" +#include "ggml-impl.h" +#include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ +//#include +#include +#include // memcpy +#include // fabsf + + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_MSC_VER) + +#define m512bh(p) p +#define m512i(p) p + +#else + +#define m512bh(p) (__m512bh)(p) +#define m512i(p) (__m512i)(p) + +#endif + +/** + * Converts brain16 to float32. 
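/*
 * The CANN host buffer type added above mirrors the pinned-host buffer types of the other
 * backends: allocation goes through aclrtMallocHost and quietly degrades to a regular CPU
 * buffer when pinned memory is unavailable or GGML_CANN_NO_PINNED is set. A small usage
 * sketch using the generic ggml-backend buffer calls (the 16 MiB size is arbitrary):
 */
ggml_backend_buffer_type_t host_buft = ggml_backend_cann_host_buffer_type();
ggml_backend_buffer_t      host_buf  = ggml_backend_buft_alloc_buffer(host_buft, 16u*1024u*1024u);
if (host_buf != NULL) {
    void * staging = ggml_backend_buffer_get_base(host_buf);
    // ... fill `staging` and copy tensor data to the device from here ...
    ggml_backend_buffer_free(host_buf);
}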
+ * + * The bfloat16 floating point format has the following structure: + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───┐ + * 0b0000000000000000 brain16 + * + * Since bf16 has the same number of exponent bits as a 32bit float, + * encoding and decoding numbers becomes relatively straightforward. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───────────────────┐ + * 0b00000000000000000000000000000000 IEEE binary32 + * + * For comparison, the standard fp16 format has fewer exponent bits. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌─┴─┐┌─┴──────┐ + * 0b0000000000000000 IEEE binary16 + * + * @see IEEE 754-2008 + */ +static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { + union { + float f; + uint32_t i; + } u; + u.i = (uint32_t)h.bits << 16; + return u.f; +} + +/** + * Converts float32 to brain16. + * + * This is binary identical with Google Brain float conversion. + * Floats shall round to nearest even, and NANs shall be quiet. + * Subnormals aren't flushed to zero, except perhaps when used. + * This code should vectorize nicely if using modern compilers. + */ +static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { + ggml_bf16_t h; + union { + float f; + uint32_t i; + } u; + u.f = s; + if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ + h.bits = (u.i >> 16) | 64; /* force to quiet */ + return h; + } + h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; + return h; +} + +#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) +#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) + +// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 +#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __FMA__ +#define __FMA__ +#endif +#ifndef __F16C__ +#define __F16C__ +#endif +#endif + +// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available +#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __SSE3__ +#define __SSE3__ +#endif +#ifndef __SSSE3__ +#define __SSSE3__ +#endif +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#include +#endif + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#if defined(__ARM_NEON) + +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include + +#ifdef _MSC_VER + +typedef uint16_t ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } + +#else + +typedef __fp16 ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } + +#endif // _MSC_VER + +#if !defined(__aarch64__) + +// 32-bit ARM compatibility + +// vaddlvq_s16 +// vpaddq_s16 +// vpaddq_s32 +// vaddvq_s32 +// vaddvq_f32 +// vmaxvq_f32 +// vcvtnq_s32_f32 +// vzip1_u8 +// vzip2_u8 + +inline static int32_t vaddlvq_s16(int16x8_t v) { + int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); + return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + +inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { + int32x2_t a0 = vpadd_s32(vget_low_s32(a), 
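// Quick sanity check of the bf16 helpers above (assumes ggml-cpu-impl.h, where they now
// live, is included): truncating to 8 mantissa bits keeps the fp32 exponent range but
// costs precision, and NaNs come back quiet. The round-trip value below was worked out
// by hand from the conversion rule and may be worth re-verifying.
#include <stdio.h>

static void bf16_demo(void) {
    const float x = 3.14159f;
    ggml_bf16_t b = GGML_FP32_TO_BF16(x);   // keeps the top 16 bits, rounding to nearest even
    float    back = GGML_BF16_TO_FP32(b);   // 3.140625f: only ~2-3 decimal digits survive
    printf("%f -> %f\n", x, back);
}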
vget_high_s32(a)); + int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); + return vcombine_s32(a0, b0); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[0]; res[1] = b[0]; + res[2] = a[1]; res[3] = b[1]; + res[4] = a[2]; res[5] = b[2]; + res[6] = a[3]; res[7] = b[3]; + + return res; +} + +inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[4]; res[1] = b[4]; + res[2] = a[5]; res[3] = b[5]; + res[4] = a[6]; res[5] = b[6]; + res[6] = a[7]; res[7] = b[7]; + + return res; +} + +// vld1q_s16_x2 +// vld1q_u8_x2 +// vld1q_u8_x4 +// vld1q_s8_x2 +// vld1q_s8_x4 +// TODO: double-check these work correctly + +typedef struct ggml_int16x8x2_t { + int16x8_t val[2]; +} ggml_int16x8x2_t; + +inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { + ggml_int16x8x2_t res; + + res.val[0] = vld1q_s16(ptr + 0); + res.val[1] = vld1q_s16(ptr + 8); + + return res; +} + +typedef struct ggml_uint8x16x2_t { + uint8x16_t val[2]; +} ggml_uint8x16x2_t; + +inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { + ggml_uint8x16x2_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + + return res; +} + +typedef struct ggml_uint8x16x4_t { + uint8x16_t val[4]; +} ggml_uint8x16x4_t; + +inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { + ggml_uint8x16x4_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + res.val[2] = vld1q_u8(ptr + 32); + res.val[3] = vld1q_u8(ptr + 48); + + return res; +} + +typedef struct ggml_int8x16x2_t { + int8x16_t val[2]; +} ggml_int8x16x2_t; + +inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { + ggml_int8x16x2_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + + return res; +} + +typedef struct ggml_int8x16x4_t { + int8x16_t val[4]; +} ggml_int8x16x4_t; + +inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { + ggml_int8x16x4_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + res.val[2] = vld1q_s8(ptr + 32); + res.val[3] = vld1q_s8(ptr + 48); + + return res; +} + +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +// NOTE: not tested +inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { + uint8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = 
a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +#else + +#define ggml_int16x8x2_t int16x8x2_t +#define ggml_uint8x16x2_t uint8x16x2_t +#define ggml_uint8x16x4_t uint8x16x4_t +#define ggml_int8x16x2_t int8x16x2_t +#define ggml_int8x16x4_t int8x16x4_t + +#define ggml_vld1q_s16_x2 vld1q_s16_x2 +#define ggml_vld1q_u8_x2 vld1q_u8_x2 +#define ggml_vld1q_u8_x4 vld1q_u8_x4 +#define ggml_vld1q_s8_x2 vld1q_s8_x2 +#define ggml_vld1q_s8_x4 vld1q_s8_x4 +#define ggml_vqtbl1q_s8 vqtbl1q_s8 +#define ggml_vqtbl1q_u8 vqtbl1q_u8 + +#endif // !defined(__aarch64__) + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#else + +#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + +#endif // !defined(__ARM_FEATURE_DOTPROD) + +#endif // defined(__ARM_NEON) + +#if defined(__ARM_NEON) && !defined(_MSC_VER) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + ggml_fp16_internal_t tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + ggml_fp16_internal_t tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; +} + +#else + +#ifdef __wasm_simd128__ +#include +#else +#ifdef __POWER9_VECTOR__ +#include +#undef bool +#define bool _Bool +#else +#if defined(_MSC_VER) || defined(__MINGW32__) +#include +#else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) +#if !defined(__riscv) +#include +#endif +#endif +#endif +#endif +#endif + +#ifdef __riscv_v_intrinsic +#include +#endif + +#if defined(__loongarch64) +#if defined(__loongarch_asx) +#include +#endif +#if defined(__loongarch_sx) +#include +#endif +#endif + +#if defined(__loongarch_asx) + +typedef union { + int32_t i; + float f; +} ft_union; + +/* float type data load instructions */ +static __m128 __lsx_vreplfr2vr_s(float val) { + ft_union fi_tmpval = {.f = val}; + return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); +} + +static __m256 __lasx_xvreplfr2vr_s(float val) { + ft_union fi_tmpval = {.f = val}; + return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); +} +#endif + +#ifdef __F16C__ + +#ifdef _MSC_VER +#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#else +#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#endif + +#elif defined(__POWER9_VECTOR__) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define 
GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + register double d; + register ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + +#else + +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 + +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); +} + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) + +#ifdef __ARM_FEATURE_SVE +#include +#endif // __ARM_FEATURE_SVE + +// precomputed f32 table for f16 (256 KB) +// defined in ggml.c, initialized in ggml_init() +extern float ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. +#if !defined(GGML_FP16_TO_FP32) +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#endif + +#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 982316f56..54f1a7c2d 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1,5 +1,5 @@ #include "ggml-cuda.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cuda/common.cuh" @@ -2552,7 +2552,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + + if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 78d70cd7a..4935f8818 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q( // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst}; + // The stream-k decomposition is only faster for recent NVIDIA GPUs. + // Also its fixup needs to allocate a temporary buffer in the memory pool. + // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. 
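    // Informal restatement of the guard that follows:
    //   - compute_capability >= CC_VOLTA      -> NVIDIA GPU recent enough for stream-k to pay off
    //   - compute_capability <  CC_OFFSET_AMD -> not an AMD device (HIP capabilities are offset past this value)
    //   - src1_ncols == ne11                  -> all of src1 is handled on a single stream, so the
    //                                            stream-k fixup buffer is never shared between streams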
+ const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11; + const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; switch (src0->type) { case GGML_TYPE_Q4_0: diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index e8a957447..021a25682 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -2742,6 +2742,7 @@ struct mmq_args { int64_t ne00; int64_t ne01; int64_t stride01; int64_t ne10; int64_t ne11; int64_t stride11; int64_t ne0; + bool use_stream_k; }; template @@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; const dim3 block_nums_xy_tiling(nty, ntx, 1); - const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; - if (!use_stream_k) { + if (!args.use_stream_k) { if (args.ne01 % mmq_y == 0) { constexpr bool need_check = false; mul_mat_q<<>> diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu index 0d5e953ee..21da63509 100644 --- a/ggml/src/ggml-cuda/sum.cu +++ b/ggml/src/ggml-cuda/sum.cu @@ -1,13 +1,15 @@ +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh. +// For this reason CUB must be included BEFORE anything else. +#include +using namespace cub; +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + #include "sumrows.cuh" #include "sum.cuh" #include -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) -#include -using namespace cub; -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) - void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) { #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) size_t tmp_size = 0; diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index e50a103ac..8df571149 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -130,42 +130,3 @@ #define cudaKernelNodeParams musaKernelNodeParams #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed #define cudaStreamEndCapture musaStreamEndCapture - -// XXX: Clang builtins mapping -#define __vsub4 __vsub4_musa -#define __vcmpeq4 __vcmpeq4_musa -#define __vcmpne4 __vcmpne4_musa - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); - -static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 
0x00 : 0xff; - } - return c; -} diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 961f3c67b..833984190 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -1,15 +1,17 @@ #pragma once -#include "ggml.h" - // GGML internal header +#include "ggml.h" + #include #include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ -#include #include -#include // memcpy -#include // fabsf +#include + +#ifdef __cplusplus +extern "C" { +#endif #undef MIN #undef MAX @@ -17,96 +19,6 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -#if defined(_MSC_VER) - -#define m512bh(p) p -#define m512i(p) p - -#else - -#define m512bh(p) (__m512bh)(p) -#define m512i(p) (__m512i)(p) - -#endif - -/** - * Converts brain16 to float32. - * - * The bfloat16 floating point format has the following structure: - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌──┴───┐┌─┴───┐ - * 0b0000000000000000 brain16 - * - * Since bf16 has the same number of exponent bits as a 32bit float, - * encoding and decoding numbers becomes relatively straightforward. - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌──┴───┐┌─┴───────────────────┐ - * 0b00000000000000000000000000000000 IEEE binary32 - * - * For comparison, the standard fp16 format has fewer exponent bits. - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌─┴─┐┌─┴──────┐ - * 0b0000000000000000 IEEE binary16 - * - * @see IEEE 754-2008 - */ -static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { - union { - float f; - uint32_t i; - } u; - u.i = (uint32_t)h.bits << 16; - return u.f; -} - -/** - * Converts float32 to brain16. - * - * This is binary identical with Google Brain float conversion. - * Floats shall round to nearest even, and NANs shall be quiet. - * Subnormals aren't flushed to zero, except perhaps when used. - * This code should vectorize nicely if using modern compilers. - */ -static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { - ggml_bf16_t h; - union { - float f; - uint32_t i; - } u; - u.f = s; - if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ - h.bits = (u.i >> 16) | 64; /* force to quiet */ - return h; - } - h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; - return h; -} - -#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) -#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) - -#ifdef __cplusplus -extern "C" { -#endif - // static_assert should be a #define, but if it's not, // fall back to the _Static_assert C11 keyword. 
// if C99 - static_assert is noop @@ -121,516 +33,10 @@ extern "C" { #endif #endif -// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 -#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __FMA__ -#define __FMA__ -#endif -#ifndef __F16C__ -#define __F16C__ -#endif -#endif - -// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available -#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __SSE3__ -#define __SSE3__ -#endif -#ifndef __SSSE3__ -#define __SSSE3__ -#endif -#endif - -#if defined(__ARM_FEATURE_SVE) -#include -#include -#endif - -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -#if defined(__ARM_NEON) - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#ifdef _MSC_VER - -typedef uint16_t ggml_fp16_internal_t; - -#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } - -#else - -typedef __fp16 ggml_fp16_internal_t; - -#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } - -#endif // _MSC_VER - -#if !defined(__aarch64__) - -// 32-bit ARM compatibility - -// vaddlvq_s16 -// vpaddq_s16 -// vpaddq_s32 -// vaddvq_s32 -// vaddvq_f32 -// vmaxvq_f32 -// vcvtnq_s32_f32 -// vzip1_u8 -// vzip2_u8 - -inline static int32_t vaddlvq_s16(int16x8_t v) { - int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); - return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); -} - -inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { - int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); - int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); - return vcombine_s16(a0, b0); -} - -inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { - int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); - int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); - return vcombine_s32(a0, b0); -} - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -inline static float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { - int32x4_t res; - - res[0] = roundf(vgetq_lane_f32(v, 0)); - res[1] = roundf(vgetq_lane_f32(v, 1)); - res[2] = roundf(vgetq_lane_f32(v, 2)); - res[3] = roundf(vgetq_lane_f32(v, 3)); - - return res; -} - -inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -// vld1q_s16_x2 -// vld1q_u8_x2 -// vld1q_u8_x4 -// vld1q_s8_x2 -// vld1q_s8_x4 -// TODO: double-check these work correctly - -typedef struct ggml_int16x8x2_t { - int16x8_t val[2]; -} ggml_int16x8x2_t; - -inline static ggml_int16x8x2_t 
ggml_vld1q_s16_x2(const int16_t * ptr) { - ggml_int16x8x2_t res; - - res.val[0] = vld1q_s16(ptr + 0); - res.val[1] = vld1q_s16(ptr + 8); - - return res; -} - -typedef struct ggml_uint8x16x2_t { - uint8x16_t val[2]; -} ggml_uint8x16x2_t; - -inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { - ggml_uint8x16x2_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - - return res; -} - -typedef struct ggml_uint8x16x4_t { - uint8x16_t val[4]; -} ggml_uint8x16x4_t; - -inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { - ggml_uint8x16x4_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - res.val[2] = vld1q_u8(ptr + 32); - res.val[3] = vld1q_u8(ptr + 48); - - return res; -} - -typedef struct ggml_int8x16x2_t { - int8x16_t val[2]; -} ggml_int8x16x2_t; - -inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { - ggml_int8x16x2_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - - return res; -} - -typedef struct ggml_int8x16x4_t { - int8x16_t val[4]; -} ggml_int8x16x4_t; - -inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { - ggml_int8x16x4_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - res.val[2] = vld1q_s8(ptr + 32); - res.val[3] = vld1q_s8(ptr + 48); - - return res; -} - -// NOTE: not tested -inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { - int8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -// NOTE: not tested -inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -#else - -#define ggml_int16x8x2_t int16x8x2_t -#define ggml_uint8x16x2_t uint8x16x2_t -#define ggml_uint8x16x4_t uint8x16x4_t -#define ggml_int8x16x2_t int8x16x2_t -#define ggml_int8x16x4_t int8x16x4_t - -#define ggml_vld1q_s16_x2 vld1q_s16_x2 -#define ggml_vld1q_u8_x2 vld1q_u8_x2 -#define ggml_vld1q_u8_x4 vld1q_u8_x4 -#define ggml_vld1q_s8_x2 vld1q_s8_x2 -#define ggml_vld1q_s8_x4 vld1q_s8_x4 -#define ggml_vqtbl1q_s8 vqtbl1q_s8 -#define ggml_vqtbl1q_u8 vqtbl1q_u8 - -#endif // !defined(__aarch64__) - -#if !defined(__ARM_FEATURE_DOTPROD) - -inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { - const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); - const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); - - return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); -} - -#else - -#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) - -#endif // !defined(__ARM_FEATURE_DOTPROD) - -#endif // defined(__ARM_NEON) - -#if defined(__ARM_NEON) && !defined(_MSC_VER) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#define GGML_FP16_TO_FP32(x) 
ggml_compute_fp16_to_fp32(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - ggml_fp16_internal_t tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - ggml_fp16_internal_t tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; -} - -#else - -#ifdef __wasm_simd128__ -#include -#else -#ifdef __POWER9_VECTOR__ -#include -#undef bool -#define bool _Bool -#else -#if defined(_MSC_VER) || defined(__MINGW32__) -#include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) -#if !defined(__riscv) -#include -#endif -#endif -#endif -#endif -#endif - -#ifdef __riscv_v_intrinsic -#include -#endif - -#if defined(__loongarch64) -#if defined(__loongarch_asx) -#include -#endif -#if defined(__loongarch_sx) -#include -#endif -#endif - -#if defined(__loongarch_asx) - -typedef union { - int32_t i; - float f; -} ft_union; - -/* float type data load instructions */ -static __m128 __lsx_vreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); -} - -static __m256 __lasx_xvreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); -} -#endif - -#ifdef __F16C__ - -#ifdef _MSC_VER -#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) -#else -#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) -#endif - -#elif defined(__POWER9_VECTOR__) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -/* the inline asm below is about 12% faster than the lookup method */ -#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - register float f; - register double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - register double d; - register ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; -} - -#else - -// FP16 <-> FP32 -// ref: https://github.com/Maratyszcza/FP16 - -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * 
exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); -} - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // __F16C__ - -#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) - -#ifdef __ARM_FEATURE_SVE -#include -#endif // __ARM_FEATURE_SVE - -// precomputed f32 table for f16 (256 KB) -// defined in ggml.c, initialized in ggml_init() -extern float ggml_table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; -} - -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#endif - -#if !defined(GGML_FP32_TO_FP16) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#endif - // bitset +typedef uint32_t ggml_bitset_t; + static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) @@ -656,6 +62,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { #define GGML_HASHSET_FULL ((size_t)-1) #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) +struct ggml_hash_set { + size_t size; + ggml_bitset_t * used; // whether or not the keys are in use i.e. 
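// The bitset helpers above store one bit per entry in 32-bit words: BITSET_SHR selects the
// word (i >> 5) and BITSET_MASK the bit within it (i & 31). A toy illustration (assumes
// ggml-impl.h is included; the word count is written out by hand for clarity):
#include <string.h>

static void bitset_demo(void) {
    enum { N = 100 };
    ggml_bitset_t used[(N + 31)/32];   // same as (N + BITSET_MASK) >> BITSET_SHR words
    memset(used, 0, sizeof(used));

    ggml_bitset_set(used, 42);         // used[42 >> BITSET_SHR] |= 1u << (42 & BITSET_MASK)
    // ggml_bitset_get(used, 42) is now true; every other index is still false
    ggml_bitset_clear(used, 42);
}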
set + struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) +}; + struct ggml_hash_set ggml_hash_set_new(size_t size); void ggml_hash_set_free(struct ggml_hash_set * hash_set); @@ -745,6 +157,30 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g GGML_ABORT("fatal error"); } +// computation graph + +enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT +}; + +struct ggml_cgraph { + int size; + int n_nodes; + int n_leafs; + + struct ggml_tensor ** nodes; + struct ggml_tensor ** grads; + struct ggml_tensor ** leafs; + + struct ggml_hash_set visited_hash_set; + + enum ggml_cgraph_eval_order order; +}; + +struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 41ac63fa4..7f0bd82d5 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-kompute.h" diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index f04e5af71..f87181d19 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -1,7 +1,7 @@ #import "ggml-metal.h" +#import "ggml-impl.h" #import "ggml-backend-impl.h" -#import "ggml.h" #import @@ -13,13 +13,16 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) #ifdef GGML_METAL_NDEBUG +#define GGML_METAL_LOG(...) #define GGML_METAL_LOG_INFO(...) #define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_ERROR(...) #else -#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__) +#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +#define GGML_METAL_LOG_DEBUG(...) 
ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -882,7 +885,7 @@ static enum ggml_status ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_nodes = gf->n_nodes; + const int n_nodes = gf->n_nodes; const int n_cb = ctx->n_cb; const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; @@ -3039,8 +3042,7 @@ static enum ggml_status ggml_metal_graph_compute( if (status != MTLCommandBufferStatusCompleted) { GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - NSString * error_code = [command_buffer error].localizedDescription; - GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]); + GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -3184,7 +3186,7 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s #ifndef GGML_METAL_NDEBUG #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { - GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)", + GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n", __func__, size_aligned / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0, @@ -3192,8 +3194,6 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); - } else { - GGML_METAL_LOG_INFO("\n"); } } else { GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", @@ -3225,15 +3225,19 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff ctx->n_buffers = 1; if (ctx->all_data != NULL) { - ctx->buffers[0].data = ctx->all_data; - ctx->buffers[0].size = size; - ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = nil; + + if (size_aligned > 0) { + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + } } - if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) { + if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); ggml_backend_metal_free_device(); @@ -3310,14 +3314,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].metal = nil; - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + if (size_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned 
options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - return false; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + return false; + } } ggml_backend_metal_log_allocated_size(device, size_aligned); @@ -3333,14 +3340,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, for (size_t i = 0; i < size; i += size_step) { const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + ctx->buffers[ctx->n_buffers].metal = nil; - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + if (size_step_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); - return false; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + return false; + } } ggml_backend_metal_log_allocated_size(device, size_step_aligned); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 8c31e2cca..8bffce860 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include @@ -230,6 +231,12 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) return _mm_packus_epi16( bytes1, bytes2); } + +static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { + const __m128i ax = _mm_sign_epi8(x, x); + const __m128i sy = _mm_sign_epi8(y, x); + return _mm_maddubs_epi16(ax, sy); +} #endif #elif defined(__SSSE3__) // horizontally add 4x4 floats @@ -4003,42 +4010,141 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (ggml_sve_cnt_b == QK8_0) { - const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); - const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + const int vector_length = ggml_sve_cnt_b*8; - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + // VLA Implementation using switch case + switch (vector_length) { + case 128: + { + // predicate for activating higher lanes for 4 float32 elements + const svbool_t ph4 = svptrue_pat_b32(SV_VL4); - // load x - const svuint8_t qx0r = 
svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - // sub 8 - const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + // 4-bit -> 8-bit + const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F)); + const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04)); + const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F)); + const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04)); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + // sub 8 + const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8); + const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8); + const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8); + const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8); - // dot product - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + // load y + const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16); + const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs); + const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16); - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + // dot product + sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx0ls, qy0l), + svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx1ls, qy1l), + svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 256: + { + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements + const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, 
svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating higher lanes for 32 int8 elements + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes + const svbool_t pl16 = svnot_b_z(ph32, ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); + const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(ph32, y0->qs); + const svint8_t qy1 = svld1_s8(ph32, y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); + } break; + default: + assert(false && "Unsupported vector length"); + break; } + #elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -4107,37 +4213,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_8(acc); #elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); + const __m128i mone = _mm_set1_epi16(1); - // Main loop - for (; ib < nb; ++ib) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - const __m128i lowMask = 
_mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - const __m128i tmp = _mm_loadu_si128((const __m128i *)x[ib].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by_0 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); - - // Apply the scale, and accumulate - acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8)); + const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8)); + const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8)); + const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8)); + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); + const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone); + accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), + _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1); + accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), + _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2); } - sumf = hsum_float_8(acc); + sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); #elif defined(__SSSE3__) // set constants const __m128i lowMask = _mm_set1_epi8(0xF); @@ -5488,29 +5594,124 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (ggml_sve_cnt_b == QK8_0) { - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const int vector_length = ggml_sve_cnt_b*8; - // load x - const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); - const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + //VLA Implemenation for SVE + switch (vector_length) { + case 128: + { + // predicate for activating lanes for 16 Int8 elements + const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); + const svbool_t pl16 = svptrue_pat_b32(SV_VL4); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), 
svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + // load x + const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); + const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16); + const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); + const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + // load y + const svint8_t qy0_0 = svld1_s8(ph16, y0->qs); + const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); + const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); + const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); + + sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), + svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), + svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); + } break; + case 256: + { + //printf("sve256"); + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating high 256 bit + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + // predicate for activating low 256 bit + const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); + + // predicate for activating high lanes for 8 float32 elements + const svbool_t ph8 = svptrue_pat_b32(SV_VL8); + // predicate for activating low lanes for 8 float32 elements + const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); + + svfloat32_t sumv00 = svdup_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits + // and add them to make one 64 element vector + // load x + const svint8_t qx_32 = svld1_s8(ph32, x0->qs); + svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); + + qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); + + // load y + const svint8_t qy_32 = svld1_s8(ph32, y0->qs); + svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); + + qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); + + // scale creation + const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); + const float32_t deq2 = 
GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); + + // duplicate deq1 in first half of vector and deq2 in second half of vector + const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); + + const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); + + sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); + } + + sumf = svaddv_f32(svptrue_b32(), sumv00); + break; + } + default: + assert(false && "Unsupported vector length"); + break; } #elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -11625,15 +11826,6 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * #endif } - -#if defined(__AVX__) -static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { - const __m128i ax = _mm_sign_epi8(x, x); - const __m128i sy = _mm_sign_epi8(y, x); - return _mm_maddubs_epi16(ax, sy); -} -#endif - #if defined(__AVX2__) static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i ax = _mm256_sign_epi8(x, x); diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 8f9d0a460..a8a2eb85a 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -1,5 +1,5 @@ #include "ggml-rpc.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include @@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp } result->buffer = reinterpret_cast(tensor->buffer); if (result->buffer && buffers.find(result->buffer) == buffers.end()) { - return nullptr; + result->buffer = nullptr; } - // require that the tensor data does not go beyond the buffer end - uint64_t tensor_size = (uint64_t) ggml_nbytes(result); - uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); - uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); - GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow - GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + if (result->buffer) { + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + } result->op = (ggml_op) tensor->op; for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { @@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector & input, std::vector #include "ggml-sycl.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-sycl/backend.hpp" @@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + return true; case GGML_OP_CONT: + return op->src[0]->type != GGML_TYPE_BF16; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: return true; case GGML_OP_ROPE: return ggml_is_contiguous(op->src[0]); case GGML_OP_IM2COL: + // TODO: add support for the new F32 operations + return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_2D: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 83737c1d9..bad960510 100644 --- 
a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -21,7 +21,7 @@ #include #include -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-vulkan-shaders.hpp" diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 28ee46e04..3a8aadae8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2,6 +2,7 @@ #define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-quants.h" #include "ggml.h" #include "ggml-aarch64.h" @@ -287,6 +288,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) { #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 +#define GGML_N_TASKS_MAX (-1) #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -1120,21 +1122,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F32x4_ADD vaddq_f32 #define GGML_F32x4_MUL vmulq_f32 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - res = GGML_F32x4_REDUCE_ONE(x[0]); \ +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \ } #define GGML_F32_VEC GGML_F32x4 @@ -1161,30 +1163,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_MUL vmulq_f16 - #define GGML_F16x8_REDUCE(res, x) \ - do { \ - int offset = GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ - const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + #define GGML_F16x8_REDUCE(res, x) \ + do { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ + (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ } while (0) #define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i]) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), 
(r)[i]) #define GGML_F16_VEC_FMA GGML_F16x8_FMA #define GGML_F16_VEC_ADD GGML_F16x8_ADD #define GGML_F16_VEC_MUL GGML_F16x8_MUL @@ -1893,6 +1895,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif +// +// ggml object +// + +struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + enum ggml_object_type type; + + char padding[4]; +}; + +static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // // ggml context // @@ -3381,7 +3400,7 @@ double ggml_type_sizef(enum ggml_type type) { } GGML_CALL const char * ggml_type_name(enum ggml_type type) { - return type_traits[type].type_name; + return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE"; } GGML_CALL bool ggml_is_quantized(enum ggml_type type) { @@ -3847,7 +3866,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + size_needed, ctx->mem_size); + __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; } @@ -19161,6 +19180,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { ggml_hash_set_reset(&cgraph->visited_hash_set); } +int ggml_graph_size(struct ggml_cgraph * cgraph) { + return cgraph->size; +} + +struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) { + if (i < 0) { + GGML_ASSERT(cgraph->n_nodes + i >= 0); + return cgraph->nodes[cgraph->n_nodes + i]; + } + + GGML_ASSERT(i < cgraph->n_nodes); + return cgraph->nodes[i]; +} + +struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) { + return cgraph->nodes; +} + +int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) { + return cgraph->n_nodes; +} + +void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + GGML_ASSERT(cgraph->size > cgraph->n_nodes); + cgraph->nodes[cgraph->n_nodes] = tensor; + cgraph->n_nodes++; +} + // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { @@ -23242,6 +23289,14 @@ int ggml_cpu_has_arm_fma(void) { #endif } +int ggml_cpu_has_riscv_v(void) { +#if defined(__riscv_v_intrinsic) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_metal(void) { #if defined(GGML_USE_METAL) return 1; diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index d0c2bb284..0193a463a 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -50,6 +50,7 @@ #include "sgemm.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-quants.h" #ifdef _MSC_VER @@ -235,6 +236,14 @@ template <> inline __m512 load(const ggml_fp16_t *p) { } #endif // __AVX512F__ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// CONSTANTS + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; +static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl); +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // FLOATING POINT MATRIX MULTIPLICATION @@ -933,6 +942,20 @@ class tinyBLAS_Q0_AVX { return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 
4)), _mm_set1_epi8(8)); } + inline __m256i load(const block_iq4_nl *b) { + return MM256_SET_M128I(load1(b), load0(b)); + } + + inline __m128i load0(const block_iq4_nl *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x)); + } + + inline __m128i load1(const block_iq4_nl *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4))); + } + inline __m256 updot(__m256i u, __m256i s) { __m256i res; #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) @@ -1159,6 +1182,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda #endif } + case GGML_TYPE_IQ4_NL: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ + k, (const block_iq4_nl *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n); + return true; +#else + return false; +#endif + } + default: return false; } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 988f207db..0e7f86f0a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -210,6 +210,7 @@ class MODEL_ARCH(IntEnum): ORION = auto() INTERNLM2 = auto() MINICPM = auto() + MINICPM3 = auto() GEMMA = auto() GEMMA2 = auto() STARCODER2 = auto() @@ -219,6 +220,7 @@ class MODEL_ARCH(IntEnum): COMMAND_R = auto() DBRX = auto() OLMO = auto() + OLMOE = auto() OPENELM = auto() ARCTIC = auto() DEEPSEEK2 = auto() @@ -364,6 +366,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.MINICPM3: "minicpm3", MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", @@ -373,6 +376,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.DEEPSEEK2: "deepseek2", @@ -869,6 +873,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.MINICPM3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.GEMMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1010,6 +1031,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.OLMOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + ], MODEL_ARCH.OPENELM: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index bc9a13ee5..2ebfa2b43 100644 --- 
a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron + "model.embed_tokens", # llama-hf nemotron olmoe "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -54,7 +54,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -66,7 +66,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon jais exaone - "model.norm", # llama-hf baichuan internlm2 + "model.norm", # llama-hf baichuan internlm2 olmoe "norm", # llama-pth "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 @@ -98,7 +98,7 @@ class TensorNameMap: "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf nemotron + "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe "layers.{bid}.attention_norm", # llama-pth "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi @@ -142,7 +142,7 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j @@ -154,7 +154,7 @@ class TensorNameMap: # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j @@ -167,7 +167,7 @@ class TensorNameMap: # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j @@ -185,7 +185,7 @@ class TensorNameMap: "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j @@ -229,7 +229,7 @@ class TensorNameMap: "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron + "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe "layers.{bid}.ffn_norm", # llama-pth "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon 
"model.layers.{bid}.ln2", # yi @@ -253,7 +253,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral - "model.layers.{bid}.mlp.gate", # qwen2moe + "model.layers.{bid}.mlp.gate", # qwen2moe olmoe "transformer.decoder_layer.{bid}.router", # Grok "transformer.blocks.{bid}.ffn.router.layer", # dbrx ), @@ -295,7 +295,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx - "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -327,7 +327,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_GATE_SHEXP: ( @@ -367,7 +367,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx - "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( @@ -378,7 +378,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_norm", # cohere + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe "transformer.blocks.{bid}.attn.q_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "transformer.layers.{bid}.attn.q_norm", # openelm @@ -387,7 +387,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe "transformer.blocks.{bid}.attn.k_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "transformer.layers.{bid}.attn.k_norm", # openelm diff --git a/include/llama.h b/include/llama.h index 6334fc30d..cfc8d85dc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -343,7 +343,7 @@ extern "C" { bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - //bool no_perf; // whether to measure performance timings, TODO: implement + bool no_perf; // whether to measure performance timings // Abort callback // if it returns true, execution of llama_decode() will be aborted @@ -1056,6 +1056,9 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); + // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed + LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i); + // available samplers: LLAMA_API 
struct llama_sampler * llama_sampler_init_greedy (void); @@ -1127,15 +1130,20 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); - // Shorthand for: + + // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise + LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); + + /// @details Sample and accept a token from the idx-th output of the last evaluation // + // Shorthand for: // const auto * logits = llama_get_logits_ith(ctx, idx); // llama_token_data_array cur_p = { ... init from logits ... }; // llama_sampler_apply(smpl, &cur_p); - // return cur_p.data[cur_p.selected].id; - // - // At this point, this is mostly a convenience function. - // + // auto token = cur_p.data[cur_p.selected].id; + // llama_sampler_accept(smpl, token); + // return token; + // Returns the sampled token LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx); // TODO: extend in the future @@ -1168,13 +1176,30 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. // - enum llama_perf_type { - LLAMA_PERF_TYPE_CONTEXT = 0, - LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, + struct llama_perf_context_data { + double t_start_ms; + double t_load_ms; + double t_p_eval_ms; + double t_eval_ms; + + int32_t n_p_eval; + int32_t n_eval; }; - LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type); - LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type); + struct llama_perf_sampler_data { + double t_sample_ms; + + int32_t n_sample; + }; + + LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); + LLAMA_API void llama_perf_context_print(const struct llama_context * ctx); + LLAMA_API void llama_perf_context_reset( struct llama_context * ctx); + + // NOTE: the following work only with samplers constructed via llama_sampler_chain_init + LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/src/llama-impl.h b/src/llama-impl.h index 87012617f..2bde75ec1 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -24,6 +24,7 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3) void llama_log_internal (ggml_log_level level, const char * format, ...); void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); +#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) 
llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 41f48ec28..5275b1d60 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -8,49 +8,45 @@ #include #include #include +#include +#include #include #include #include -static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng, std::vector & probs) { -#if 1 - probs.resize(cur_p->size); - for (size_t i = 0; i < cur_p->size; ++i) { - probs[i] = cur_p->data[i].p; - } - - std::discrete_distribution dist(probs.begin(), probs.end()); -#else - // avoid the copy with a custom iterator +static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { + // iterator for the probabilities +#ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#endif struct probs_iterator { typedef std::input_iterator_tag iterator_category; typedef float value_type; typedef float * pointer; typedef float & reference; - typedef size_t difference_type; + typedef ptrdiff_t difference_type; - const llama_token_data_array * data; - size_t i; + const llama_token_data * data; - bool operator==(const probs_iterator & other) const { return data + i == other.data + other.i; } - bool operator!=(const probs_iterator & other) const { return data + i != other.data + other.i; } - float operator*() const { return data->data[i].p; } - probs_iterator & operator++() { ++i; return *this; } - probs_iterator operator++(int) { probs_iterator tmp = *this; ++i; return tmp; } + bool operator==(const probs_iterator & other) const { return data == other.data; } + bool operator!=(const probs_iterator & other) const { return data != other.data; } + const float & operator*() const { return data->p; } + probs_iterator & operator++() { ++data; return *this; } + probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; } }; + +#ifdef __GNUC__ #pragma GCC diagnostic pop - - std::discrete_distribution dist(probs_iterator{cur_p, 0}, probs_iterator{cur_p, cur_p->size}); - - GGML_UNUSED(probs); #endif + std::discrete_distribution dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size}); + return dist(rng); } +/* static void llama_log_softmax(float * array, size_t size) { float max_l = *std::max_element(array, array + size); float sum = 0.f; @@ -64,6 +60,7 @@ static void llama_log_softmax(float * array, size_t size) { array[i] = logf(array[i] / sum); } } +*/ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) { GGML_ASSERT(cur_p->size > 0); @@ -166,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } +static uint32_t get_rng_seed(uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + // use system clock if std::random_device is not a true RNG + static bool is_rd_prng = std::random_device().entropy() == 0; + if (is_rd_prng) { + return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count(); + } + std::random_device rd; + return rd(); + } + return seed; +} + // llama_sampler API const char * llama_sampler_name(const struct llama_sampler * smpl) { @@ -231,67 +241,92 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; } - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + llama_token_data_array cur_p = { + /* .data = */ cur.data(), + /* .size = */ cur.size(), + /* .selected = */ -1, + /* .sorted 
= */ false, + }; llama_sampler_apply(smpl, &cur_p); - return cur_p.data[cur_p.selected].id; + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); + + auto token = cur_p.data[cur_p.selected].id; + + llama_sampler_accept(smpl, token); + + return token; } // sampler chain +static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) { + return "chain"; +} + +static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + for (auto * smpl : chain->samplers) { + llama_sampler_accept(smpl, token); + } + + chain->n_sample++; +} + +static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + for (auto * smpl : chain->samplers) { + llama_sampler_apply(smpl, cur_p); + } +} + +static void llama_sampler_chain_reset(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto * smpl : chain->samplers) { + llama_sampler_reset(smpl); + } + + chain->t_sample_us = 0; + chain->n_sample = 0; +} + +static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) { + const auto * chain_src = (const llama_sampler_chain *) smpl->ctx; + + auto * result = llama_sampler_chain_init(chain_src->params); + + for (auto * smpl : chain_src->samplers) { + llama_sampler_chain_add(result, llama_sampler_clone(smpl)); + } + + return result; +} + +static void llama_sampler_chain_free(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto * smpl : chain->samplers) { + llama_sampler_free(smpl); + } + + delete chain; +} + static struct llama_sampler_i llama_sampler_chain_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "chain"; }, - /* .accept = */ [](struct llama_sampler * smpl, llama_token token) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - time_meas tm(chain->t_sample_us, chain->params.no_perf); - - for (auto * smpl : chain->samplers) { - llama_sampler_accept(smpl, token); - } - - chain->n_sample++; - }, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - time_meas tm(chain->t_sample_us, chain->params.no_perf); - - for (auto * smpl : chain->samplers) { - llama_sampler_apply(smpl, cur_p); - } - }, - /* .reset = */ [](struct llama_sampler * smpl) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto * smpl : chain->samplers) { - llama_sampler_reset(smpl); - } - - chain->t_sample_us = 0; - chain->n_sample = 0; - }, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * chain_src = (const llama_sampler_chain *) smpl->ctx; - - auto * result = llama_sampler_chain_init(chain_src->params); - - for (auto * smpl : chain_src->samplers) { - llama_sampler_chain_add(result, llama_sampler_clone(smpl)); - } - - return result; - }, - /* .free = */ [](struct llama_sampler * smpl) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto * smpl : chain->samplers) { - llama_sampler_free(smpl); - } - - delete chain; - }, + /* .name = */ llama_sampler_chain_name, + /* .accept = */ llama_sampler_chain_accept, + /* .apply = */ llama_sampler_chain_apply, + /* .reset = */ llama_sampler_chain_reset, + /* .clone = */ llama_sampler_chain_clone, + /* 
.free = */ llama_sampler_chain_free, }; struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) { @@ -314,13 +349,26 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) { const auto * p = (const llama_sampler_chain *) chain->ctx; - if (i < 0 || i >= (int32_t) p->samplers.size()) { + if (i < 0 || (size_t) i >= p->samplers.size()) { return nullptr; } return p->samplers[i]; } +struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) { + auto * p = (llama_sampler_chain *) chain->ctx; + + if (i < 0 || (size_t) i >= p->samplers.size()) { + return nullptr; + } + + auto * result = p->samplers[i]; + p->samplers.erase(p->samplers.begin() + i); + + return result; +} + int llama_sampler_chain_n(const struct llama_sampler * chain) { const auto * p = (const llama_sampler_chain *) chain->ctx; @@ -366,10 +414,9 @@ struct llama_sampler * llama_sampler_init_greedy() { struct llama_sampler_dist { const uint32_t seed; + uint32_t seed_cur; std::mt19937 rng; - - std::vector probs; // work array }; static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) { @@ -378,7 +425,7 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl* static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_dist *) smpl->ctx; - cur_p->selected = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + cur_p->selected = llama_sample_dist(cur_p, ctx->rng); } static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { @@ -397,7 +444,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample static void llama_sampler_dist_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_dist *) smpl->ctx; - ctx->rng = std::mt19937(ctx->seed); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static void llama_sampler_dist_free(struct llama_sampler * smpl) { @@ -414,12 +462,13 @@ static struct llama_sampler_i llama_sampler_dist_i = { }; struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { + auto seed_cur = get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_dist_i, /* .ctx = */ new llama_sampler_dist { - /* .seed = */ seed, - /* .rng = */ std::mt19937(seed), - /* .probs = */ {}, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), }, }; } @@ -1014,6 +1063,7 @@ struct llama_sampler_mirostat { const int32_t n_vocab; const uint32_t seed; + uint32_t seed_cur; const float tau; const float eta; @@ -1023,8 +1073,6 @@ struct llama_sampler_mirostat { float mu; std::mt19937 rng; - - std::vector probs; }; static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) { @@ -1055,7 +1103,7 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke llama_sampler_top_k_impl(cur_p, std::max(int(k), 1)); llama_sampler_softmax_impl(cur_p); - const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; @@ -1084,7 +1132,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_mirostat *) smpl->ctx; ctx->mu = 2.0f*ctx->tau; - ctx->rng = 
std::mt19937(ctx->seed); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static void llama_sampler_mirostat_free(struct llama_sampler * smpl) { @@ -1101,17 +1150,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = { }; struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) { + auto seed_cur = get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_mirostat_i, /* .ctx = */ new llama_sampler_mirostat { - /* .n_vocab = */ n_vocab, - /* .seed = */ seed, - /* .tau = */ tau, - /* .eta = */ eta, - /* .m = */ m, - /* .mu = */ 2.0f*tau, - /* .rng = */ std::mt19937(seed), - /* .probs = */ {}, + /* .n_vocab = */ n_vocab, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .tau = */ tau, + /* .eta = */ eta, + /* .m = */ m, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed_cur), }, }; } @@ -1120,6 +1170,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see struct llama_sampler_mirostat_v2 { const uint32_t seed; + uint32_t seed_cur; const float tau; const float eta; @@ -1127,8 +1178,6 @@ struct llama_sampler_mirostat_v2 { float mu; std::mt19937 rng; - - std::vector probs; }; static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) { @@ -1152,7 +1201,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t // Normalize the probabilities of the remaining words llama_sampler_softmax_impl(cur_p); - const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; @@ -1166,7 +1215,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; ctx->mu = 2.0f*ctx->tau; - ctx->rng = std::mt19937(ctx->seed); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) { @@ -1199,15 +1249,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = { }; struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) { + auto seed_cur = get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_mirostat_v2_i, /* .ctx = */ new llama_sampler_mirostat_v2 { - /* .seed = */ seed, - /* .tau = */ tau, - /* .eta = */ eta, - /* .mu = */ 2.0f*tau, - /* .rng = */ std::mt19937(seed), - /* .probs = */ {}, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .tau = */ tau, + /* .eta = */ eta, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed_cur), }, }; } @@ -1493,6 +1544,8 @@ struct llama_sampler * llama_sampler_init_penalties( ignore_eos = false; } + penalty_last_n = std::max(penalty_last_n, 0); + return new llama_sampler { /* .iface = */ &llama_sampler_penalties_i, /* .ctx = */ new llama_sampler_penalties { @@ -1527,6 +1580,10 @@ static const char * llama_sampler_logit_bias_name(const struct llama_sampler * / static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; + if (ctx->logit_bias.empty()) { + return; + } + ctx->to_search.clear(); // update the candidates that have not been shuffled in the vocabulary (i.e. 
idx == id) @@ -1538,6 +1595,10 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to } } + if (ctx->to_search.empty()) { + return; + } + // search for the remaining candidates that were not found in the previous step for (size_t i = 0; i < cur_p->size; ++i) { for (const auto & lb : ctx->to_search) { @@ -1548,6 +1609,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to } } } + static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx; return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data()); @@ -1579,3 +1641,65 @@ struct llama_sampler * llama_sampler_init_logit_bias( }, }; } + +// utils + +uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { + if (smpl->iface == &llama_sampler_dist_i) { + return ((const llama_sampler_dist *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_mirostat_i) { + return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_mirostat_v2_i) { + return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_chain_i) { + const auto * ctx = (const llama_sampler_chain *) smpl->ctx; + for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) { + const uint32_t seed = llama_sampler_get_seed(*it); + if (seed != LLAMA_DEFAULT_SEED) { + return seed; + } + } + } + + return LLAMA_DEFAULT_SEED; +} + +// perf + +struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) { + struct llama_perf_sampler_data data = {}; + + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; + + data.t_sample_ms = 1e-3 * ctx->t_sample_us; + data.n_sample = std::max(0, ctx->n_sample); + + return data; +} + +void llama_perf_sampler_print(const struct llama_sampler * chain) { + const auto data = llama_perf_sampler(chain); + + LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); +} + +void llama_perf_sampler_reset(struct llama_sampler * chain) { + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + auto * ctx = (struct llama_sampler_chain *) chain->ctx; + + ctx->t_sample_us = ctx->n_sample = 0; +} diff --git a/src/llama.cpp b/src/llama.cpp index 39e20440e..0da764f9d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -193,6 +193,7 @@ enum llm_arch { LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, + LLM_ARCH_MINICPM3, LLM_ARCH_GEMMA, LLM_ARCH_GEMMA2, LLM_ARCH_STARCODER2, @@ -201,6 +202,7 @@ enum llm_arch { LLM_ARCH_COMMAND_R, LLM_ARCH_DBRX, LLM_ARCH_OLMO, + LLM_ARCH_OLMOE, LLM_ARCH_OPENELM, LLM_ARCH_ARCTIC, LLM_ARCH_DEEPSEEK2, @@ -241,6 +243,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_MINICPM3, "minicpm3" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_STARCODER2, "starcoder2" }, @@ -249,6 
+252,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, @@ -1034,6 +1038,29 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, }, }, + { + LLM_ARCH_MINICPM3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, + { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, + { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, + { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_GEMMA, { @@ -1168,6 +1195,26 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_OLMOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_OPENELM, { @@ -2156,6 +2203,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer if (host_buffer) { buft = ggml_backend_sycl_host_buffer_type(); } +#elif defined(GGML_USE_CANN) + if (host_buffer) { + buft = ggml_backend_cann_host_buffer_type(); + } #elif defined(GGML_USE_CPU_HBM) buft = ggml_backend_cpu_hbm_buffer_type(); #elif defined(GGML_USE_VULKAN) @@ -2248,6 +2299,7 @@ enum e_model { MODEL_MEDIUM, MODEL_LARGE, MODEL_XL, + MODEL_A1_7B, MODEL_A2_7B, MODEL_8x7B, MODEL_8x22B, @@ -2482,6 +2534,7 @@ struct llama_cparams { bool causal_attn; bool offload_kqv; bool flash_attn; + bool no_perf; enum llama_pooling_type pooling_type; @@ -5211,6 +5264,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_MEDIUM: return "0.4B"; case MODEL_LARGE: return "0.8B"; case MODEL_XL: return "1.5B"; + case MODEL_A1_7B: return "A1.7B"; case MODEL_A2_7B: return "A2.7B"; case MODEL_8x7B: return "8x7B"; case MODEL_8x22B: return "8x22B"; @@ -5385,6 +5439,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MINICPM3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + + switch (hparams.n_layer) { + case 62: model.type 
= e_model::MODEL_4B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_GROK: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5750,6 +5815,14 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OLMOE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 16: model.type = e_model::MODEL_A1_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_OPENELM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -6657,8 +6730,6 @@ static bool llm_load_tensors( bool use_mlock, llama_progress_callback progress_callback, void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); - auto & hparams = model.hparams; model.split_mode = split_mode; @@ -6894,6 +6965,54 @@ static bool llm_load_tensors( } } } break; + case LLM_ARCH_MINICPM3: + { + const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}); + + layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}); + + layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}); + layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}); + + layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}); + layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? 
llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } + } break; case LLM_ARCH_GROK: { if (n_expert == 0) { @@ -7931,6 +8050,44 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_OLMOE: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + + GGML_ASSERT(n_expert > 0); + GGML_ASSERT(n_expert_used > 0); + + // MoE branch + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + } + } break; case LLM_ARCH_OPENELM: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -8589,14 +8746,13 @@ static bool llm_load_tensors( } } - // loading time will be recalculate after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = ggml_time_us() - model.t_start_us; return true; } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { + model.t_start_us = ggml_time_us(); + try { llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); @@ -8658,6 +8814,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam return -1; } + // loading time will be recalculate after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = ggml_time_us() - model.t_start_us; + return 0; } @@ -9258,7 +9418,7 @@ static struct ggml_tensor * llm_build_copy_mask_state( // FIXME: zero-out NANs? 
states = ggml_mul(ctx, states, state_mask); - // copy states which won't be changed further (between n_seqs and n_rs) + // copy states which won't be changed further (between n_seqs and n_kv) ggml_build_forward_expand(graph, ggml_cpy(ctx, ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), @@ -9411,7 +9571,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( struct ggml_tensor * cur, struct ggml_tensor * x_prev, struct ggml_tensor ** wkv_state) { - size_t n_embed = cur->ne[0]; + size_t n_embd = cur->ne[0]; size_t n_seq_tokens = cur->ne[1]; size_t n_seqs = cur->ne[2]; @@ -9422,8 +9582,8 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - sx = ggml_reshape_2d(ctx, sx, n_embed, n_tokens); - cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); @@ -9448,11 +9608,11 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( xxx ); - struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0); - struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float)); - struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float)); - struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float)); - struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float)); + struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); + struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); struct ggml_tensor * xw = ggml_add( ctx, @@ -9521,7 +9681,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( ) ); - w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed)); + w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd)); w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens); @@ -9530,21 +9690,21 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( r = ggml_transpose(ctx, r); struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float)); + cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); + *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens); + cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); cur = ggml_norm(ctx, cur, 64e-5f); // Convert back to regular vectors. 
- cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); cur = ggml_mul(ctx, cur, g); cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); - return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs); + return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); } static struct ggml_tensor * llm_build_rwkv6_channel_mix( @@ -9877,8 +10037,8 @@ struct llm_build_context { struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - inp = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + inp = ggml_graph_node(gf, i); if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { break; } else { @@ -12837,6 +12997,215 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_minicpm3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + //TODO: if the model varies, these parameters need to be read from the model + const int64_t n_embd_base = 256; + const float scale_embd = 12.0f; + const float scale_depth = 1.4f; + const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k)); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // scale the input embeddings + inpL = ggml_scale(ctx0, inpL, scale_embd); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * rope_factors = build_rope_factors(il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self_attention + { + struct ggml_tensor * q = NULL; + // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); + + q = llm_build_norm(ctx0, q, hparams, + model.layers[il].attn_q_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(q, "q", il); + + // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 0); + cb(q_nope, "q_nope", il); + + // and {n_head * n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + 
n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_pe_compresseed, "kv_pe_compresseed", il); + + // split into {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + kv_pe_compresseed->nb[1], + 0); + cb(kv_compressed, "kv_compressed", il); + + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_pe_compresseed->nb[1], + kv_pe_compresseed->nb[1], + ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); + + kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed", il); + + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); + + // and {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); + + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + q_pe = ggml_rope_ext( + ctx0, q_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(q_pe, "q_pe", il); + + // shared RoPE key + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + k_pe = ggml_rope_ext( + ctx0, k_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(k_pe, "k_pe", il); + + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // scale_res - scale the hidden states for residual connection + const float scale_res = 
scale_depth/sqrtf(float(n_layer)); + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled", il); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // scale the hidden states for residual connection + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled_ffn", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head scaling + const float scale_lmhead = float(n_embd_base)/float(n_embd); + cur = ggml_scale(ctx0, cur, scale_lmhead); + cb(cur, "lmhead_scaling", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_gemma() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -13533,6 +13902,134 @@ struct llm_build_context { return gf; } + // based on the build_qwen2moe() function, changes: + // * removed shared experts + // * removed bias + // * added q, k norm + struct ggml_cgraph * build_olmoe() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, 
nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_moe_ffn(ctx0, lctx, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + cb, il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_openelm() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -15377,6 +15874,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_minicpm(); } break; + case LLM_ARCH_MINICPM3: + { + result = llm.build_minicpm3(); + } break; case LLM_ARCH_GEMMA: { result = llm.build_gemma(); @@ -15409,6 +15910,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_olmo(); } break; + case LLM_ARCH_OLMOE: + { + result = llm.build_olmoe(); + } break; case LLM_ARCH_OPENELM: { result = llm.build_openelm(); @@ -15820,7 +16325,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { // clear unused states for (int i = 0; i < n_kv; ++i) { - uint32_t cell_id = i + kv_self.head; + const uint32_t cell_id = i + kv_self.head; llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); @@ -16076,19 +16581,21 @@ static int llama_decode_internal( return -1; } - for (uint32_t i = 0; i < n_tokens_all; ++i) { - if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); - return -1; - } - } - const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT + if (batch_all.token) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); + return -1; + } + } + } + GGML_ASSERT(n_tokens_all <= cparams.n_batch); 
GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); @@ -16205,8 +16712,8 @@ static int llama_decode_internal( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; + struct ggml_tensor * res = ggml_graph_node(gf, -1); + struct ggml_tensor * embd = ggml_graph_node(gf, -2); if (lctx.n_outputs == 0) { // no output @@ -16215,9 +16722,9 @@ static int llama_decode_internal( } else if (cparams.embeddings) { res = nullptr; // do not extract logits for embedding case embd = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { - embd = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + embd = ggml_graph_node(gf, i); break; } } @@ -16375,19 +16882,21 @@ static int llama_encode_internal( return -1; } - for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); - return -1; - } - } - const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + if (batch.token) { + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); + return -1; + } + } + } + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); @@ -16432,15 +16941,15 @@ static int llama_encode_internal( // there are two cases here if (llama_model_has_decoder(&lctx.model)) { // first case is an encoder-decoder T5 model where embeddings are passed to decoder - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); } else { // second case is an encoder-only T5 model if (cparams.embeddings) { // only output embeddings if required - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = gf->nodes[gf->n_nodes - 2]; + embd = ggml_graph_node(gf, -2); } GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); } @@ -17530,6 +18039,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= name.find("time_mix_first.weight") == std::string::npos; quantize &= name.find("time_mix_w1.weight") == std::string::npos; quantize &= name.find("time_mix_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; @@ -17939,6 +18450,7 @@ struct llama_context_params llama_context_default_params() { /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, + /*.no_perf =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ 
nullptr, }; @@ -18061,9 +18573,9 @@ struct llama_model * llama_load_model_from_file( unsigned percentage = (unsigned) (100 * progress); while (percentage > *cur_percentage_p) { *cur_percentage_p = percentage; - LLAMA_LOG_INFO("."); + LLAMA_LOG("."); if (percentage >= 100) { - LLAMA_LOG_INFO("\n"); + LLAMA_LOG("\n"); } } return true; @@ -18149,6 +18661,7 @@ struct llama_context * llama_new_context_with_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; @@ -18486,7 +18999,7 @@ struct llama_context * llama_new_context_with_model( // note: the number of splits during measure is higher than during inference due to the kv shift int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes); + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf)); LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); } } @@ -18585,6 +19098,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2MOE: + case LLM_ARCH_OLMOE: case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: @@ -18595,6 +19109,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_CODESHELL: case LLM_ARCH_NEMOTRON: case LLM_ARCH_EXAONE: + case LLM_ARCH_MINICPM3: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here @@ -20067,10 +20582,14 @@ void llama_synchronize(struct llama_context * ctx) { // add the evaluation to the stats if (ctx->n_queued_tokens == 1) { - ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_eval++; } else if (ctx->n_queued_tokens > 1) { - ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_p_eval += ctx->n_queued_tokens; } @@ -20666,6 +21185,7 @@ const char * llama_print_system_info(void) { s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | "; s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; @@ -20677,65 +21197,40 @@ const char * llama_print_system_info(void) { return s.c_str(); } -void llama_perf_print(const void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - const auto * p = (const struct llama_context *) ctx; +struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) { + struct llama_perf_context_data data = {}; - const double t_start_ms = 1e-3 * p->t_start_us; - const double t_end_ms = 1.00 * ggml_time_ms(); - const double t_load_ms = 1e-3 * p->t_load_us; - const double t_p_eval_ms = 1e-3 * p->t_p_eval_us; - const double t_eval_ms = 1e-3 * p->t_eval_us; - - const int32_t n_p_eval = std::max(0, p->n_p_eval); - const int32_t n_eval = std::max(1, p->n_eval); - - LLAMA_LOG_INFO("%s: load time = %10.2f 
ms\n", __func__, t_load_ms); - LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval)); - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - const auto * smpl = (const struct llama_sampler *) ctx; - const auto * p = (const struct llama_sampler_chain *) smpl->ctx; - - const double t_sampler_ms = 1e-3 * p->t_sample_us; - - const int32_t n_sampler = std::max(0, p->n_sample); - - LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler); - } break; - default: - GGML_ABORT("invalid perf type"); + if (ctx == nullptr) { + return data; } + + data.t_start_ms = 1e-3 * ctx->t_start_us; + data.t_load_ms = 1e-3 * ctx->t_load_us; + data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; + data.t_eval_ms = 1e-3 * ctx->t_eval_us; + data.n_p_eval = std::max(1, ctx->n_p_eval); + data.n_eval = std::max(1, ctx->n_eval); + + return data; } -void llama_perf_reset(void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - auto * p = (struct llama_context *) ctx; +void llama_perf_context_print(const struct llama_context * ctx) { + const auto data = llama_perf_context(ctx); - p->t_start_us = ggml_time_us(); - p->t_eval_us = p->n_eval = 0; - p->t_p_eval_us = p->n_p_eval = 0; - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - auto * smpl = (struct llama_sampler *) ctx; - auto * p = (struct llama_sampler_chain *) smpl->ctx; + const double t_end_ms = 1e-3 * ggml_time_us(); - p->t_sample_us = p->n_sample = 0; - } break; - default: - GGML_ABORT("invalid perf type"); - } + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); + LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); +} + +void llama_perf_context_reset(struct llama_context * ctx) { + ctx->t_start_us = ggml_time_us(); + ctx->t_eval_us = ctx->n_eval = 0; + ctx->t_p_eval_us = ctx->n_p_eval = 0; } void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) { @@ -20787,8 +21282,8 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l if (len < 128) { g_state.log_callback(level, buffer, g_state.log_callback_user_data); } else { - char* buffer2 = new char[len+1]; - vsnprintf(buffer2, len+1, format, args_copy); + char * buffer2 = new char[len + 1]; + vsnprintf(buffer2, len + 1, format, args_copy); buffer2[len] = 0; g_state.log_callback(level, buffer2, g_state.log_callback_user_data); delete[] buffer2; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 30e71cfd4..7dcd3fce8 
100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -108,6 +108,7 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
 #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
 
 # llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-log.cpp)
 llama_target_and_test(test-arg-parser.cpp)
 llama_target_and_test(test-quantize-fns.cpp)
 llama_target_and_test(test-quantize-perf.cpp)
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
index 9ad91acc0..e07d09733 100644
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -1,3 +1,6 @@
+#include "arg.h"
+#include "common.h"
+
 #include <string>
 #include <vector>
 #include <sstream>
@@ -6,18 +9,16 @@
 #undef NDEBUG
 #include <cassert>
 
-#include "common.h"
-
 int main(void) {
     gpt_params params;
 
     printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
     for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
         try {
-            auto options = gpt_params_parser_init(params, (enum llama_example)ex);
+            auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
             std::unordered_set<std::string> seen_args;
             std::unordered_set<std::string> seen_env_vars;
-            for (const auto & opt : options) {
+            for (const auto & opt : ctx_arg.options) {
                 // check for args duplications
                 for (const auto & arg : opt.args) {
                     if (seen_args.find(arg) == seen_args.end()) {
@@ -52,40 +53,51 @@ int main(void) {
    };
 
    std::vector<std::string> argv;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
 
    printf("test-arg-parser: test invalid usage\n\n");
 
+    // missing value
    argv = {"binary_name", "-m"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
+    // wrong value (int)
    argv = {"binary_name", "-ngl", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
+    // wrong value (enum)
    argv = {"binary_name", "-sm", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
+    argv = {"binary_name", "--draft", "123"};
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
 
    printf("test-arg-parser: test valid usage\n\n");
 
    argv = {"binary_name", "-m", "model_file.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.model == "model_file.gguf");
 
    argv = {"binary_name", "-t", "1234"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.cpuparams.n_threads == 1234);
 
    argv = {"binary_name", "--verbose"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
-    assert(params.verbosity == 1);
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.verbosity > 1);
 
    argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.model == "abc.gguf");
    assert(params.n_predict == 6789);
    assert(params.n_batch == 9090);
 
+    // --draft cannot be used outside llama-speculative
+    argv = {"binary_name", "--draft", "123"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+    assert(params.n_draft == 123);
+
    // skip this part on windows, because setenv is not supported
 #ifdef _WIN32
    printf("test-arg-parser: skip on windows build\n");
@@ -94,12 +106,12 @@ int main(void) {
    setenv("LLAMA_ARG_THREADS", "blah", true);
    argv = {"binary_name"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
    setenv("LLAMA_ARG_THREADS", "1010", true);
    argv = {"binary_name"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.model == "blah.gguf");
    assert(params.cpuparams.n_threads == 1010);
 
@@ -109,7 +121,7 @@ int main(void) {
    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
    setenv("LLAMA_ARG_THREADS", "1010", true);
    argv = {"binary_name", "-m", "overwritten.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.model == "overwritten.gguf");
    assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 635de01d7..aa7896def 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -519,7 +519,7 @@ struct test_case {
 
        // add sentinels as graph nodes so that they are checked in the callback
        for (ggml_tensor * sentinel : sentinels) {
-            gf->nodes[gf->n_nodes++] = sentinel;
+            ggml_graph_add_node(gf, sentinel);
        }
 
        // randomize tensors
@@ -679,9 +679,9 @@ struct test_case {
 
        // duplicate the op
        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
+        int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
        for (int i = 1; i < n_runs; i++) {
-            gf->nodes[gf->n_nodes++] = out;
+            ggml_graph_add_node(gf, out);
        }
 
        // calculate memory
@@ -696,11 +696,11 @@ struct test_case {
            }
            return size;
        };
-        for (int i = 0; i < gf->n_nodes; i++) {
-            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
+        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+            if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
                continue;
            }
-            mem += tensor_op_size(gf->nodes[i]);
+            mem += tensor_op_size(ggml_graph_node(gf, i));
        }
 
        // run
@@ -804,7 +804,7 @@ struct test_case {
        ggml_graph_cpy(gf, gb);
        ggml_build_backward_expand(ctx, gf, gb, false);
        if (expect.size() != 1 || expect[0] != 0.0f) {
-            GGML_ASSERT(gb->n_nodes > gf->n_nodes);
+            GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
                GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
            }
diff --git a/tests/test-log.cpp b/tests/test-log.cpp
new file mode 100644
index 000000000..211222369
--- /dev/null
+++ b/tests/test-log.cpp
@@ -0,0 +1,39 @@
+#include "log.h"
+
+#include <cstdlib>
+#include <thread>
+
+int main() {
+    const int n_thread = 8;
+
+    std::thread threads[n_thread];
+    for (int i = 0; i < n_thread; i++) {
+        threads[i] = std::thread([i]() {
+            const int n_msg = 1000;
+
+            for (int j = 0; j < n_msg; j++) {
+                const int log_type = std::rand() % 4;
+
+                switch (log_type) {
+                    case 0: LOG_INF("Thread %d: %d\n", i, j); break;
+                    case 1: LOG_WRN("Thread %d: %d\n", i, j); break;
+                    case 2: LOG_ERR("Thread %d: %d\n", i, j); break;
+                    case 3: LOG_DBG("Thread %d: %d\n", i, j); break;
+                    default:
+                        break;
+                }
+
+                if (rand () % 10 < 5) {
+                    gpt_log_set_timestamps(gpt_log_main(), rand() % 2);
+                    gpt_log_set_prefix    (gpt_log_main(), rand() % 2);
+                }
+            }
+        });
+    }
+
+    for (int i = 0; i < n_thread; i++) {
+        threads[i].join();
+    }
+
+    return 0;
+}
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 37400c179..d738b7a45 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -245,7 +245,7 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
        }
    }
 
-    printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
+    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
           samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }
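
Note on the performance API reworked above: llama_perf_print()/llama_perf_reset() are replaced by llama_perf_context(), llama_perf_context_print() and llama_perf_context_reset(), and timing collection is now gated by the new no_perf context parameter, which defaults to true (disabled). Below is a minimal usage sketch, not part of the patch: the model path is hypothetical and error handling is omitted.

    #include "llama.h"

    int main() {
        // load a model (path is a placeholder)
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);

        // opt back into timing collection, since no_perf now defaults to true
        llama_context_params cparams = llama_context_default_params();
        cparams.no_perf = false;

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... run llama_decode() as usual ...

        // read the raw counters (replaces llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT))
        llama_perf_context_data perf = llama_perf_context(ctx);
        (void) perf.t_eval_ms;

        llama_perf_context_print(ctx);  // log load/prompt/eval timings
        llama_perf_context_reset(ctx);  // clear counters between runs

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }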