diff --git a/.editorconfig b/.editorconfig index bd525e13f..f88f8da67 100644 --- a/.editorconfig +++ b/.editorconfig @@ -28,4 +28,5 @@ indent_size = 2 indent_style = tab [examples/cvector-generator/*.txt] +trim_trailing_whitespace = unset insert_final_newline = unset diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 6244b4812..b3efe0084 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -33,15 +33,13 @@ jobs: - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I - # have disabled them for now until the reason why - # is understood. - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete. + #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } steps: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 6155e9415..311abf02a 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -30,7 +30,7 @@ jobs: strategy: matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] + sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken build_type: [RelWithDebInfo] include: - build_type: Release diff --git a/CMakeLists.txt b/CMakeLists.txt index c90414afa..49ba45356 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,9 +144,6 @@ option(LLAMA_BUILD_SERVER "llama: build server example" option(LLAMA_LASX "llama: enable lasx" ON) option(LLAMA_LSX "llama: enable lsx" ON) -# add perf arguments -option(LLAMA_PERF "llama: enable perf" OFF) - # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) @@ -665,6 +662,7 @@ if (LLAMA_SYCL) #todo: AOT find_package(IntelSYCL REQUIRED) + find_package(MKL REQUIRED) message(STATUS "SYCL found") @@ -679,11 +677,9 @@ if (LLAMA_SYCL) endif() add_compile_options(-I./) #include DPCT - add_compile_options(-I/${SYCL_INCLUDE_DIR}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") endif() @@ -693,8 +689,10 @@ if (LLAMA_SYCL) list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp") if (WIN32) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() + add_compile_options(-I/${SYCL_INCLUDE_DIR}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") if (LLAMA_SYCL_TARGET STREQUAL "INTEL") set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") @@ -869,10 +867,6 @@ if (LLAMA_CPU_HBM) target_link_libraries(ggml PUBLIC memkind) endif() -if (LLAMA_PERF) - add_compile_definitions(GGML_PERF) -endif() - function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") diff --git a/CMakePresets.json b/CMakePresets.json index e2b7a79e3..fba22af9a 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -11,9 +11,21 @@ "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." } }, - + { + "name": "sycl-base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_CXX_COMPILER": "icx", + "LLAMA_SYCL": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." + } + }, { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, - { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, + { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } }, { @@ -35,15 +47,18 @@ }, { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, - { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] }, - { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] }, + { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, + { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] }, { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, - { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] }, - { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }, + { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, + { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, - { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] }, - { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] } + { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, + { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, + + { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] }, + { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] } ] } diff --git a/Makefile b/Makefile index dddf647cd..3aad77394 100644 --- a/Makefile +++ b/Makefile @@ -344,9 +344,6 @@ ifdef LLAMA_GPROF MK_CFLAGS += -pg MK_CXXFLAGS += -pg endif -ifdef LLAMA_PERF - MK_CPPFLAGS += -DGGML_PERF -endif # Architecture specific # TODO: probably these flags need to be tweaked on some architectures @@ -1051,7 +1048,7 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS) +tests/test-grammar-integration: tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/README-sycl.md b/README-sycl.md index bd1984706..b7e2bb12a 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -410,15 +410,9 @@ Output (example): 4. Install build tools -a. Download & install cmake for Windows: https://cmake.org/download/ +a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer) +b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/) -b. Download & install mingw-w64 make for Windows provided by w64devkit - -- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip). - -- Extract `w64devkit` on your pc. - -- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`). ### II. Build llama.cpp @@ -428,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release +cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release # Option 2: Or FP16 -cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON +cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON cmake --build build --config Release -j ``` @@ -441,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in .\examples\sycl\win-build-sycl.bat ``` +Or, use CMake presets to build: +```sh +cmake --preset x64-windows-sycl-release +cmake --build build-x64-windows-sycl-release -j --target llama-cli + +cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release +cmake --build build-x64-windows-sycl-release -j --target llama-cli + +cmake --preset x64-windows-sycl-debug +cmake --build build-x64-windows-sycl-debug -j --target llama-cli +``` + +Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. + *Notes:* -- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`. +- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`. ### III. Run the inference diff --git a/common/common.cpp b/common/common.cpp index 657d2ffa8..72098de04 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -6,7 +6,6 @@ #include "llama.h" #include -#include #include #include #include @@ -274,26 +273,22 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } +#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } + bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { const char split_delim = ','; llama_sampling_params & sparams = params.sparams; if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. params.seed = std::stoul(argv[i]); sparams.seed = std::stoul(argv[i]); return true; } if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads = std::stoi(argv[i]); if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); @@ -301,10 +296,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-tb" || arg == "--threads-batch") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_batch = std::stoi(argv[i]); if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); @@ -312,10 +304,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-td" || arg == "--threads-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_draft = std::stoi(argv[i]); if (params.n_threads_draft <= 0) { params.n_threads_draft = std::thread::hardware_concurrency(); @@ -323,10 +312,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_batch_draft = std::stoi(argv[i]); if (params.n_threads_batch_draft <= 0) { params.n_threads_batch_draft = std::thread::hardware_concurrency(); @@ -334,10 +320,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-p" || arg == "--prompt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.prompt = argv[i]; return true; } @@ -350,10 +333,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--prompt-cache") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.path_prompt_cache = argv[i]; return true; } @@ -366,10 +346,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-bf" || arg == "--binary-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i], std::ios::binary); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -385,10 +362,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-f" || arg == "--file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -404,10 +378,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--in-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -418,66 +389,42 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_predict = std::stoi(argv[i]); return true; } if (arg == "--top-k") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.top_k = std::stoi(argv[i]); return true; } if (arg == "-c" || arg == "--ctx-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_ctx = std::stoi(argv[i]); return true; } if (arg == "--grp-attn-n" || arg == "-gan") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.grp_attn_n = std::stoi(argv[i]); return true; } if (arg == "--grp-attn-w" || arg == "-gaw") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.grp_attn_w = std::stoi(argv[i]); return true; } if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rope_freq_base = std::stof(argv[i]); return true; } if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rope_freq_scale = std::stof(argv[i]); return true; } if (arg == "--rope-scaling") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string value(argv[i]); /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } @@ -486,217 +433,140 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--rope-scale") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rope_freq_scale = 1.0f / std::stof(argv[i]); return true; } if (arg == "--yarn-orig-ctx") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_orig_ctx = std::stoi(argv[i]); return true; } if (arg == "--yarn-ext-factor") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_ext_factor = std::stof(argv[i]); return true; } if (arg == "--yarn-attn-factor") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_attn_factor = std::stof(argv[i]); return true; } if (arg == "--yarn-beta-fast") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_beta_fast = std::stof(argv[i]); return true; } if (arg == "--yarn-beta-slow") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_beta_slow = std::stof(argv[i]); return true; } if (arg == "--pooling") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string value(argv[i]); /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } else { invalid_param = true; } return true; } if (arg == "--defrag-thold" || arg == "-dt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.defrag_thold = std::stof(argv[i]); return true; } if (arg == "--samplers") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG const auto sampler_names = string_split(argv[i], ';'); sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); return true; } if (arg == "--sampling-seq") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); return true; } if (arg == "--top-p") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.top_p = std::stof(argv[i]); return true; } if (arg == "--min-p") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.min_p = std::stof(argv[i]); return true; } if (arg == "--temp") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.temp = std::stof(argv[i]); sparams.temp = std::max(sparams.temp, 0.0f); return true; } if (arg == "--tfs") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.tfs_z = std::stof(argv[i]); return true; } if (arg == "--typical") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.typical_p = std::stof(argv[i]); return true; } if (arg == "--repeat-last-n") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_last_n = std::stoi(argv[i]); sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); return true; } if (arg == "--repeat-penalty") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_repeat = std::stof(argv[i]); return true; } if (arg == "--frequency-penalty") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_freq = std::stof(argv[i]); return true; } if (arg == "--presence-penalty") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_present = std::stof(argv[i]); return true; } if (arg == "--dynatemp-range") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.dynatemp_range = std::stof(argv[i]); return true; } if (arg == "--dynatemp-exp") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.dynatemp_exponent = std::stof(argv[i]); return true; } if (arg == "--mirostat") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.mirostat = std::stoi(argv[i]); return true; } if (arg == "--mirostat-lr") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.mirostat_eta = std::stof(argv[i]); return true; } if (arg == "--mirostat-ent") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.mirostat_tau = std::stof(argv[i]); return true; } if (arg == "--cfg-negative-prompt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.cfg_negative_prompt = argv[i]; return true; } if (arg == "--cfg-negative-prompt-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -710,203 +580,125 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--cfg-scale") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.cfg_scale = std::stof(argv[i]); return true; } if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_batch = std::stoi(argv[i]); return true; } if (arg == "-ub" || arg == "--ubatch-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_ubatch = std::stoi(argv[i]); return true; } if (arg == "--keep") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_keep = std::stoi(argv[i]); return true; } if (arg == "--draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_draft = std::stoi(argv[i]); return true; } if (arg == "--chunks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_chunks = std::stoi(argv[i]); return true; } if (arg == "-np" || arg == "--parallel") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_parallel = std::stoi(argv[i]); return true; } if (arg == "-ns" || arg == "--sequences") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_sequences = std::stoi(argv[i]); return true; } if (arg == "--p-split" || arg == "-ps") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.p_split = std::stof(argv[i]); return true; } if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model = argv[i]; return true; } if (arg == "-md" || arg == "--model-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model_draft = argv[i]; return true; } if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model_alias = argv[i]; return true; } if (arg == "-mu" || arg == "--model-url") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model_url = argv[i]; return true; } if (arg == "-hfr" || arg == "--hf-repo") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hf_repo = argv[i]; return true; } if (arg == "-hff" || arg == "--hf-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hf_file = argv[i]; return true; } if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; return true; } if (arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG const char* lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; return true; } if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lora_base = argv[i]; return true; } if (arg == "--control-vector") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vectors.push_back({ 1.0f, argv[i], }); return true; } if (arg == "--control-vector-scaled") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG const char* fname = argv[i]; - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vectors.push_back({ std::stof(argv[i]), fname, }); return true; } if (arg == "--control-vector-layer-range") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vector_layer_start = std::stoi(argv[i]); - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vector_layer_end = std::stoi(argv[i]); return true; } if (arg == "--mmproj") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.mmproj = argv[i]; return true; } if (arg == "--image") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.image.emplace_back(argv[i]); return true; } @@ -922,6 +714,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.embedding = true; return true; } + if (arg == "--embd-normalize") { + CHECK_ARG + params.embd_normalize = std::stoi(argv[i]); + return true; + } + if (arg == "--embd-output-format") { + CHECK_ARG + params.embd_out = argv[i]; + return true; + } + if (arg == "--embd-separator") { + CHECK_ARG + params.embd_sep = argv[i]; + return true; + } if (arg == "-if" || arg == "--interactive-first") { params.interactive_first = true; return true; @@ -975,10 +782,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_gpu_layers = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -987,10 +791,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_gpu_layers_draft = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -999,10 +800,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--main-gpu" || arg == "-mg") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.main_gpu = std::stoi(argv[i]); #ifndef GGML_USE_CUDA_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); @@ -1010,10 +808,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--split-mode" || arg == "-sm") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string arg_next = argv[i]; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1038,10 +833,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--tensor-split" || arg == "-ts") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string arg_next = argv[i]; // split string by , and / @@ -1066,10 +858,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--rpc") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rpc_servers = argv[i]; return true; } @@ -1078,10 +867,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--numa") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string value(argv[i]); /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } @@ -1094,10 +880,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--verbosity") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.verbosity = std::stoi(argv[i]); return true; } @@ -1110,18 +893,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-r" || arg == "--reverse-prompt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.antiprompt.emplace_back(argv[i]); return true; } if (arg == "-ld" || arg == "--logdir") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.logdir = argv[i]; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -1130,26 +907,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-lcs" || arg == "--lookup-cache-static") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lookup_cache_static = argv[i]; return true; } if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lookup_cache_dynamic = argv[i]; return true; } if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.logits_file = argv[i]; return true; } @@ -1158,26 +926,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--ppl-stride") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ppl_stride = std::stoi(argv[i]); return true; } if (arg == "--ppl-output-type") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ppl_output_type = std::stoi(argv[i]); return true; } if (arg == "-ptc" || arg == "--print-token-count") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_print = std::stoi(argv[i]); return true; } @@ -1190,10 +949,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--hellaswag-tasks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hellaswag_tasks = std::stoi(argv[i]); return true; } @@ -1202,10 +958,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--winogrande-tasks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.winogrande_tasks = std::stoi(argv[i]); return true; } @@ -1214,10 +967,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--multiple-choice-tasks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.multiple_choice_tasks = std::stoi(argv[i]); return true; } @@ -1234,10 +984,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-l" || arg == "--logit-bias") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::stringstream ss(argv[i]); llama_token key; char sign; @@ -1270,34 +1017,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--in-prefix") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.input_prefix = argv[i]; return true; } if (arg == "--in-suffix") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.input_suffix = argv[i]; return true; } if (arg == "--grammar") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.grammar = argv[i]; return true; } if (arg == "--grammar-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1312,18 +1047,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-j" || arg == "--json-schema") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); return true; } if (arg == "--override-kv") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (!string_parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; @@ -1332,42 +1061,27 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--host") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hostname = argv[i]; return true; } if (arg == "--port") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.port = std::stoi(argv[i]); return true; } if (arg == "--path") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.public_path = argv[i]; return true; } if (arg == "--api-key") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.api_keys.push_back(argv[i]); return true; } if (arg == "--api-key-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream key_file(argv[i]); if (!key_file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1384,43 +1098,28 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--ssl-key-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ssl_file_key = argv[i]; return true; } if (arg == "--ssl-cert-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ssl_file_cert = argv[i]; return true; } if (arg == "--timeout" || arg == "-to") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.timeout_read = std::stoi(argv[i]); params.timeout_write = std::stoi(argv[i]); return true; } if (arg == "--threads-http") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_http = std::stoi(argv[i]); return true; } if (arg == "-spf" || arg == "--system-prompt-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1437,10 +1136,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--log-format") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (std::strcmp(argv[i], "json") == 0) { params.log_json = true; } else if (std::strcmp(argv[i], "text") == 0) { @@ -1460,10 +1156,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--slot-save-path") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.slot_save_path = argv[i]; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -1472,10 +1165,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--chat-template") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (!llama_chat_verify_template(argv[i])) { fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); @@ -1486,10 +1176,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--slot-prompt-similarity" || arg == "-sps") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.slot_prompt_similarity = std::stof(argv[i]); return true; } @@ -1498,37 +1185,25 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-npp") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG auto p = string_split(argv[i], split_delim); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); return true; } if (arg == "-ntg") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG auto p = string_split(argv[i], split_delim); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); return true; } if (arg == "-npl") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG auto p = string_split(argv[i], split_delim); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); return true; } if (arg == "--context-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i], std::ios::binary); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1539,59 +1214,38 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--chunk-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.chunk_size = std::stoi(argv[i]); return true; } if (arg == "--chunk-separator") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.chunk_separator = argv[i]; return true; } if (arg == "--junk") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_junk = std::stoi(argv[i]); return true; } if (arg == "--pos") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.i_pos = std::stoi(argv[i]); return true; } if (arg == "-o" || arg == "--output" || arg == "--output-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.out_file = argv[i]; params.cvector_outfile = argv[i]; return true; } if (arg == "-ofreq" || arg == "--output-frequency") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_out_freq = std::stoi(argv[i]); return true; } if (arg == "--save-frequency") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_save_freq = std::stoi(argv[i]); return true; } @@ -1604,59 +1258,38 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--chunk" || arg == "--from-chunk") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.i_chunk = std::stoi(argv[i]); return true; } // cvector params if (arg == "--completions-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.cvector_completions_file = argv[i]; return true; } if (arg == "--positive-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.cvector_positive_file = argv[i]; return true; } if (arg == "--negative-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.cvector_negative_file = argv[i]; return true; } if (arg == "--completions") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_completions = std::stoi(argv[i]); return true; } if (arg == "--pca-batch") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_pca_batch = std::stoi(argv[i]); return true; } if (arg == "--pca-iter") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_pca_iterations = std::stoi(argv[i]); return true; } @@ -1671,10 +1304,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa // We have a matching known parameter requiring an argument, // now we need to check if there is anything after this argv // and flag invalid_param or parse it. - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { invalid_param = true; return true; @@ -1870,6 +1500,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "backend" }); options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); + if (llama_supports_mlock()) { options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); } @@ -1943,6 +1574,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); + options.push_back({ "embedding" }); + options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize }); + options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" }); + options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" }); + options.push_back({ "server" }); options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); @@ -1988,8 +1624,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "cvector", " --completions-file FNAME", "completions file (default: '%s')", params.cvector_completions_file.c_str() }); options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); - options.push_back({ "cvector", " --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); - options.push_back({ "cvector", " --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); + options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); + options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); printf("usage: %s [options]\n", argv[0]); @@ -2664,7 +2300,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat } // Set the output file - std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb"), fclose); + + struct FILE_deleter { + void operator()(FILE * f) const { + fclose(f); + } + }; + + std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); if (!outfile) { fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); return false; @@ -3051,14 +2694,34 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz // Embedding utils // -void llama_embd_normalize(const float * inp, float * out, int n) { +void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) { double sum = 0.0; - for (int i = 0; i < n; i++) { - sum += inp[i] * inp[i]; - } - sum = sqrt(sum); - const float norm = sum > 0.0 ? 1.0f / sum : 0.0f; + switch (embd_norm) { + case -1: // no normalisation + sum = 1.0; + break; + case 0: // max absolute + for (int i = 0; i < n; i++) { + if (sum < std::abs(inp[i])) sum = std::abs(inp[i]); + } + sum /= 32760.0; // make an int16 range + break; + case 2: // euclidean + for (int i = 0; i < n; i++) { + sum += inp[i] * inp[i]; + } + sum = std::sqrt(sum); + break; + default: // p-norm (euclidean is p-norm p=2) + for (int i = 0; i < n; i++) { + sum += std::pow(std::abs(inp[i]), embd_norm); + } + sum = std::pow(sum, 1.0 / embd_norm); + break; + } + + const float norm = sum > 0.0 ? 1.0 / sum : 0.0f; for (int i = 0; i < n; i++) { out[i] = inp[i] * norm; @@ -3076,6 +2739,14 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) sum2 += embd2[i] * embd2[i]; } + // Handle the case where one or both vectors are zero vectors + if (sum1 == 0.0 || sum2 == 0.0) { + if (sum1 == 0.0 && sum2 == 0.0) { + return 1.0f; // two zero vectors are similar + } + return 0.0f; + } + return sum / (sqrt(sum1) * sqrt(sum2)); } diff --git a/common/common.h b/common/common.h index 9a1dc4a2f..a5c738f8b 100644 --- a/common/common.h +++ b/common/common.h @@ -152,7 +152,6 @@ struct gpt_params { bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it - bool embedding = false; // get only sentence embedding bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\" bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles @@ -179,6 +178,12 @@ struct gpt_params { std::string mmproj = ""; // path to multimodal projector std::vector image; // path to image file(s) + // embedding + bool embedding = false; // get only sentence embedding + int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) + std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix + std::string embd_sep = "\n"; // separator of embendings + // server params int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds @@ -377,7 +382,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz // Embedding utils // -void llama_embd_normalize(const float * inp, float * out, int n); +void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index fbf1e1ea3..67598b561 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -214,7 +214,7 @@ src_func = f""" """ convert_py_pth = pathlib.Path("convert-hf-to-gguf.py") -convert_py = convert_py_pth.read_text() +convert_py = convert_py_pth.read_text(encoding="utf-8") convert_py = re.sub( r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", lambda m: m.group(1) + src_func + m.group(3), @@ -222,7 +222,7 @@ convert_py = re.sub( flags=re.DOTALL | re.MULTILINE, ) -convert_py_pth.write_text(convert_py) +convert_py_pth.write_text(convert_py, encoding="utf-8") logger.info("+++ convert-hf-to-gguf.py was updated") diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index c85db5fa4..b56bb8afc 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -65,7 +65,8 @@ class Model: # subclasses should define this! model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, + model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -96,7 +97,8 @@ class Model: ftype_lw: str = ftype_up.lower() # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) + self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, + split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @classmethod def __init_subclass__(cls): @@ -332,6 +334,8 @@ class Model: self.gguf_writer.close() def write_vocab(self): + if len(self.gguf_writer.tensors) != 1: + raise ValueError('Splitting the vocabulary is not supported') self.gguf_writer.write_header_to_file(self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() @@ -970,7 +974,11 @@ class XverseModel(Model): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size + # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, + # because vocab_size is the count of items, and indexes start at 0. + max_vocab_index = max(tokenizer.get_vocab().values()) + if max_vocab_index >= vocab_size: + raise ValueError("Vocabulary size exceeds expected maximum size.") reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() @@ -1403,6 +1411,48 @@ class LlamaModel(Model): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("BitnetForCausalLM") +class BitnetModel(Model): + model_arch = gguf.MODEL_ARCH.BITNET + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def weight_quant(self, weight): + dtype = weight.dtype + weight = weight.float() + s = 1 / weight.abs().mean().clamp(min=1e-5) + weight = (weight * s).round().clamp(-1, 1) / s + scale = weight.abs().max().unsqueeze(0) + weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype) + weight = torch.sign(weight).type(dtype) + return weight.type(dtype), scale.type(torch.float32) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if any(self.match_model_tensor_name(new_name, key, bid) for key in [ + gguf.MODEL_TENSOR.ATTN_Q, + gguf.MODEL_TENSOR.ATTN_K, + gguf.MODEL_TENSOR.ATTN_V, + gguf.MODEL_TENSOR.ATTN_OUT, + gguf.MODEL_TENSOR.FFN_UP, + gguf.MODEL_TENSOR.FFN_DOWN, + gguf.MODEL_TENSOR.FFN_GATE, + ]): + # transform weight into 1/0/-1 (in fp32) + weight_torch, scale_torch = self.weight_quant(data_torch) + yield (new_name, weight_torch) + yield (new_name.removesuffix(".weight") + ".scale", scale_torch) + else: + yield (new_name, data_torch) + + @Model.register("GrokForCausalLM") class GrokModel(Model): model_arch = gguf.MODEL_ARCH.GROK @@ -2728,6 +2778,123 @@ class DeepseekV2Model(Model): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("T5ForConditionalGeneration") +@Model.register("T5WithLMHeadModel") +class T5Model(Model): + model_arch = gguf.MODEL_ARCH.T5 + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'spiece.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if (token_id >= vocab_size): + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_add_eos_token(True) + + def set_gguf_parameters(self): + self.gguf_writer.add_name("T5") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) + self.gguf_writer.add_block_count(self.hparams["num_layers"]) + self.gguf_writer.add_head_count(self.hparams["num_heads"]) + self.gguf_writer.add_key_length(self.hparams["d_kv"]) + self.gguf_writer.add_value_length(self.hparams["d_kv"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or + # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor + # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight". + if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight": + logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.") + return [] + + return [(self.map_tensor_name(name), data_torch)] + @Model.register("ChatGLMModel") class ChatGLMModel(Model): model_arch = gguf.MODEL_ARCH.CHATGLM @@ -2914,6 +3081,7 @@ class ChatGLMModel(Model): return [(self.map_tensor_name(name), data_torch)] + ###### CONVERSION LOGIC ###### @@ -2999,10 +3167,44 @@ def parse_args() -> argparse.Namespace: "--verbose", action="store_true", help="increase output verbosity", ) + parser.add_argument( + "--split-max-tensors", type=int, default=0, + help="max tensors in each split", + ) + parser.add_argument( + "--split-max-size", type=str, default="0", + help="max size per split N(M|G)", + ) + parser.add_argument( + "--dry-run", action="store_true", + help="only print out a split plan and exit, without writing any new files", + ) + parser.add_argument( + "--no-tensor-first-split", action="store_true", + help="do not add tensors to the first split (disabled by default)" + ) return parser.parse_args() +def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1000 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1000 * 1000 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1000 * 1000 * 1000 + elif split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") + + if n < 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") + + return n + + def main() -> None: args = parse_args() @@ -3035,6 +3237,10 @@ def main() -> None: "auto": gguf.LlamaFileType.GUESSED, } + if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"): + logger.error("Error: Cannot use temp file when splitting") + sys.exit(1) + if args.outfile is not None: fname_out = args.outfile else: @@ -3052,7 +3258,10 @@ def main() -> None: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, + args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors, + split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, + small_first_shard=args.no_tensor_first_split) logger.info("Set model parameters") model_instance.set_gguf_parameters() @@ -3063,13 +3272,13 @@ def main() -> None: model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) if args.vocab_only: - logger.info(f"Exporting model vocab to '{model_instance.fname_out}'") + logger.info("Exporting model vocab...") model_instance.write_vocab() + logger.info("Model vocab successfully exported.") else: - logger.info(f"Exporting model to '{model_instance.fname_out}'") + logger.info("Exporting model...") model_instance.write() - - logger.info(f"Model successfully exported to '{model_instance.fname_out}'") + logger.info("Model successfully exported.") if __name__ == '__main__': diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md index 7b0e79c1f..5182e906d 100644 --- a/examples/cvector-generator/README.md +++ b/examples/cvector-generator/README.md @@ -17,7 +17,7 @@ Related PRs: ./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 # With advanced options -./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100 +./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100 # To see help message ./cvector-generator -h diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 9941683db..355905cb0 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -40,7 +40,7 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { printf("\nexample usage:\n"); printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]); printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]); - printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]); + printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]); printf("\n"); } @@ -377,8 +377,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { // create templated prompts std::vector completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false); auto format_template = [](std::string persona, std::string suffix) { - // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]" - return persona + " " + suffix; + // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] " + return persona + suffix; }; for (size_t i = 0; i < positive_prompts.size(); ++i) { for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) { diff --git a/examples/cvector-generator/negative.txt b/examples/cvector-generator/negative.txt index 2ac3387f1..3e9951752 100644 --- a/examples/cvector-generator/negative.txt +++ b/examples/cvector-generator/negative.txt @@ -1 +1 @@ -[INST] Act like a person who is extremely sad. [/INST] \ No newline at end of file +[INST] Act like a person who is extremely sad. [/INST] diff --git a/examples/cvector-generator/positive.txt b/examples/cvector-generator/positive.txt index f28e9aa1a..880236787 100644 --- a/examples/cvector-generator/positive.txt +++ b/examples/cvector-generator/positive.txt @@ -1 +1 @@ -[INST] Act like a person who is extremely happy. [/INST] \ No newline at end of file +[INST] Act like a person who is extremely happy. [/INST] diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 2298ec3e7..86df18958 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -19,3 +19,43 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. + +## extra parameters +### --embd-normalize $integer$ +| $integer$ | description | formula | +|-----------|---------------------|---------| +| $-1$ | none | +| $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$ +| $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$ +| $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$ +| $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$ + +### --embd-output-format $'string'$ +| $'string'$ | description | | +|------------|------------------------------|--| +| '' | same as before | (default) +| 'array' | single embeddings | $[[x_1,...,x_n]]$ +| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$ +| 'json' | openai style | +| 'json+' | add cosine similarity matrix | + +### --embd-separator $"string"$ +| $"string"$ | | +|--------------|-| +| "\n" | (default) +| "<#embSep#>" | for exemple +| "<#sep#>" | other exemple + +## examples +### Unix-based systems (Linux, macOS, etc.): + +```bash +./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +``` + +### Windows: + +```powershell +embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +``` + diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 244751e00..1466e5b2b 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -7,23 +7,30 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static std::vector split_lines(const std::string & s) { - std::string line; +static std::vector split_lines(const std::string & s, const std::string & separator = "\n") { std::vector lines; - std::stringstream ss(s); - while (std::getline(ss, line)) { - lines.push_back(line); + size_t start = 0; + size_t end = s.find(separator); + + while (end != std::string::npos) { + lines.push_back(s.substr(start, end - start)); + start = end + separator.length(); + end = s.find(separator, start); } + + lines.push_back(s.substr(start)); // Add the last part + return lines; } -static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { - for (size_t i = 0; i < tokens.size(); i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { + size_t n_tokens = tokens.size(); + for (size_t i = 0; i < n_tokens; i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, true); } } -static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { +static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { // clear previous kv_cache values (irrelevant for embeddings) llama_kv_cache_clear(ctx); @@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu // try to get sequence embeddings - supported only when pooling_type is not NONE const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - if (embd == NULL) { - fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); - continue; - } - } + GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); float * out = output + batch.seq_id[i][0] * n_embd; - //TODO: I would also add a parameter here to enable normalization or not. - /*fprintf(stdout, "unnormalized_embedding:"); - for (int hh = 0; hh < n_embd; hh++) { - fprintf(stdout, "%9.6f ", embd[hh]); - } - fprintf(stdout, "\n");*/ - llama_embd_normalize(embd, out, n_embd); + llama_embd_normalize(embd, out, n_embd, embd_norm); } } @@ -97,6 +92,12 @@ int main(int argc, char ** argv) { const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + return 1; + } + if (n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); @@ -109,7 +110,7 @@ int main(int argc, char ** argv) { } // split the prompt into lines - std::vector prompts = split_lines(params.prompt); + std::vector prompts = split_lines(params.prompt, params.embd_sep); // max batch size const uint64_t n_batch = params.n_batch; @@ -169,7 +170,7 @@ int main(int argc, char ** argv) { // encode if at capacity if (batch.n_tokens + n_toks > n_batch) { float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd); + batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); llama_batch_clear(batch); p += s; s = 0; @@ -182,29 +183,78 @@ int main(int argc, char ** argv) { // final batch float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd); + batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - // print the first part of the embeddings or for a single prompt, the full embedding - fprintf(stdout, "\n"); - for (int j = 0; j < n_prompts; j++) { - fprintf(stdout, "embedding %d: ", j); - for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); - } + if (params.embd_out.empty()) { + // print the first part of the embeddings or for a single prompt, the full embedding fprintf(stdout, "\n"); - } - - // print cosine similarity matrix - if (n_prompts > 1) { - fprintf(stdout, "\n"); - printf("cosine similarity matrix:\n\n"); - for (int i = 0; i < n_prompts; i++) { - for (int j = 0; j < n_prompts; j++) { - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f ", sim); + for (int j = 0; j < n_prompts; j++) { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { + if (params.embd_normalize == 0) { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } else { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } } fprintf(stdout, "\n"); } + + // print cosine similarity matrix + if (n_prompts > 1) { + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + fprintf(stdout, "%6.6s ", prompts[i].c_str()); + } + fprintf(stdout, "\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "%1.10s", prompts[i].c_str()); + fprintf(stdout, "\n"); + } + } + } + + if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { + const bool notArray = params.embd_out != "array"; + + fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); + for (int j = 0;;) { // at least one iteration (one prompt) + if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); + fprintf(stdout, "["); + for (int i = 0;;) { // at least one iteration (n_embd > 0) + fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); + i++; + if (i < n_embd) fprintf(stdout, ","); else break; + } + fprintf(stdout, notArray ? "]\n }" : "]"); + j++; + if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break; + } + fprintf(stdout, notArray ? "\n ]" : "]\n"); + + if (params.embd_out == "json+" && n_prompts > 1) { + fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); + for (int i = 0;;) { // at least two iteration (n_prompts > 1) + fprintf(stdout, " ["); + for (int j = 0;;) { // at least two iteration (n_prompts > 1) + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f", sim); + j++; + if (j < n_prompts) fprintf(stdout, ", "); else break; + } + fprintf(stdout, " ]"); + i++; + if (i < n_prompts) fprintf(stdout, ",\n"); else break; + } + fprintf(stdout, "\n ]"); + } + + if (notArray) fprintf(stdout, "\n}\n"); } // clean up diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 250707cca..683231cc2 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -44,6 +44,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // clear previous kv_cache values (irrelevant for embeddings) llama_kv_cache_clear(ctx); + llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); // run model @@ -97,7 +98,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo const llama_model * mdl = llama_get_model(ctx); llama_kv_cache_clear(ctx); + llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); + llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); std::vector inputs = llama_tokenize(mdl, prompt, false, true); @@ -165,8 +168,7 @@ int main(int argc, char * argv[]) { llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams); - // create new context - set to embedding mode - cparams.embeddings = true; + // create generation context llama_context * ctx = llama_new_context_with_model(mdl, cparams); // ### Embedding/Representation ### diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 5bde18917..2c1e3f61b 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -131,22 +131,29 @@ class LlamaState: ObservableObject { messageLog += "\(text)" - while await llamaContext.n_cur < llamaContext.n_len { - let result = await llamaContext.completion_loop() - messageLog += "\(result)" + Task.detached { + while await llamaContext.n_cur < llamaContext.n_len { + let result = await llamaContext.completion_loop() + await MainActor.run { + self.messageLog += "\(result)" + } + } + + let t_end = DispatchTime.now().uptimeNanoseconds + let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S + let tokens_per_second = Double(await llamaContext.n_len) / t_generation + + await llamaContext.clear() + + await MainActor.run { + self.messageLog += """ + \n + Done + Heat up took \(t_heat)s + Generated \(tokens_per_second) t/s\n + """ + } } - - let t_end = DispatchTime.now().uptimeNanoseconds - let t_generation = Double(t_end - t_heat_end) / NS_PER_S - let tokens_per_second = Double(await llamaContext.n_len) / t_generation - - await llamaContext.clear() - messageLog += """ - \n - Done - Heat up took \(t_heat)s - Generated \(tokens_per_second) t/s\n - """ } func bench() async { diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 28584e14b..76e2052d5 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -16,41 +16,41 @@ struct quant_option { }; static const std::vector QUANT_OPTIONS = { - { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", }, - { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, - { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, - { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, + { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, + { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", }, + { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", }, { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", }, { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", }, { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, - { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, - { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, + { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, + { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, - { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, - { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , }, - { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, - { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, - { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, + { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, + { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, + { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, + { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, + { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, - { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, - { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", }, - { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", }, - { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, - { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, - { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, - { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", }, - { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, - { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", }, - { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, - { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, + { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", }, + { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", }, + { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, + { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", }, + { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, + { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, + { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, + { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, + { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, + { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. - { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, + { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 2e43417e1..b9e872cdf 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -73,9 +73,10 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz return chunks; } -static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { - for (size_t i = 0; i < tokens.size(); i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { + size_t n_tokens = tokens.size(); + for (size_t i = 0; i < n_tokens; i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -160,6 +161,12 @@ int main(int argc, char ** argv) { const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + return 1; + } + if (n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index 19c9f643d..5513e9121 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -634,12 +634,12 @@ return html`
-