diff --git a/.devops/tools.sh b/.devops/tools.sh index 24dcfd350..9a86e6ea0 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -8,11 +8,11 @@ arg1="$1" shift if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then - python3 ./convert_hf_to_gguf.py "$@" + exec python3 ./convert_hf_to_gguf.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then - ./llama-quantize "$@" + exec ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then - ./llama-cli "$@" + exec ./llama-cli "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Skip model quantization, it already exists: ${i/f16/q4_0}" else echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." - ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 + exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 fi done elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then - ./llama-server "$@" + exec ./llama-server "$@" else echo "Unknown command: $arg1" echo "Available commands: " diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 886d33d2d..022d31fb2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -662,6 +662,8 @@ jobs: defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + - build: 'llvm-arm64-opencl-adreno' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' steps: - name: Clone @@ -703,6 +705,28 @@ jobs: run: | choco install ninja + - name: Install OpenCL Headers and Libs + id: install_opencl + if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} + run: | + git clone https://github.com/KhronosGroup/OpenCL-Headers + cd OpenCL-Headers + mkdir build && cd build + cmake .. ` + -DBUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build . --target install + git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader + cd OpenCL-ICD-Loader + mkdir build-arm64-release && cd build-arm64-release + cmake .. ` + -A arm64 ` + -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build . --target install --config release + - name: Build id: cmake_build run: | @@ -732,7 +756,7 @@ jobs: - name: Test id: cmake_test # not all machines have native AVX-512 - if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} + if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} run: | cd build ctest -L main -C Release --verbose --timeout 900 diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 9e66fb68c..671fe595c 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -79,7 +79,7 @@ jobs: # Setup nodejs (to be used for verifying bundled index.html) - uses: actions/setup-node@v4 with: - node-version: 22 + node-version: '22.11.0' - name: Verify bundled index.html id: verify_server_index_html diff --git a/CODEOWNERS b/CODEOWNERS index 88ab6de4f..adeba5395 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,3 +1,5 @@ # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs -ci/ @ggerganov +/ci/ @ggerganov +/.devops/ @ngxson +/examples/server/ @ngxson diff --git a/Makefile b/Makefile index e4efb242b..a095462cf 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ BUILD_TARGETS = \ llama-infill \ llama-llava-cli \ llama-minicpmv-cli\ + llama-qwen2vl-cli\ llama-lookahead \ llama-lookup \ llama-lookup-create \ @@ -1411,6 +1412,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual +llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \ + examples/llava/llava.cpp \ + examples/llava/llava.h \ + examples/llava/clip.cpp \ + examples/llava/clip.h \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual + ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift (cd examples/batched.swift; make build) diff --git a/README.md b/README.md index 6fdd8d9ee..49095acc8 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) +- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d) @@ -433,6 +434,20 @@ To learn more about model quantization, [read this documentation](examples/quant +## [`llama-run`](examples/run) + +#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3]. + +-
+ Run a model with a specific prompt (by default it's pulled from Ollama registry) + + ```bash + llama-run granite-code + ``` + +
+ +[^3]: [https://github.com/containers/ramalama](RamaLama) ## [`llama-simple`](examples/simple) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3bd75a975..bea32bfbe 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -84,7 +84,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info) # Use curl to download model url if (LLAMA_CURL) find_package(CURL REQUIRED) - add_definitions(-DLLAMA_USE_CURL) + target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) find_library(CURL_LIBRARY curl REQUIRED) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) diff --git a/common/arg.cpp b/common/arg.cpp index 0e4bc6549..416a468f8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) { } } +const std::vector kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; + +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); +} + +static std::string get_all_kv_cache_types() { + std::ostringstream msg; + for (const auto & type : kv_cache_types) { + msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", "); + } + return msg.str(); +} + // // CLI argument parsing functions // @@ -591,7 +620,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), @@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", - string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + string_format( + "KV cache data type for K\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.cache_type_k) + ), [](common_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_k = value; + params.cache_type_k = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_K")); add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", - string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + string_format( + "KV cache data type for V\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.cache_type_v) + ), [](common_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_v = value; + params.cache_type_v = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); add_opt(common_arg( @@ -1711,6 +1750,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); + add_opt(common_arg( + {"--no-webui"}, + string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), + [](common_params & params) { + params.webui = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), @@ -2111,35 +2157,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.speculative.n_max = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); add_opt(common_arg( {"--draft-min", "--draft-n-min"}, "N", string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), [](common_params & params, int value) { params.speculative.n_min = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); add_opt(common_arg( {"--draft-p-split"}, "P", string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), [](common_params & params, const std::string & value) { params.speculative.p_split = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); add_opt(common_arg( {"--draft-p-min"}, "P", string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), [](common_params & params, const std::string & value) { params.speculative.p_min = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), [](common_params & params, int value) { params.speculative.n_ctx = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); add_opt(common_arg( {"-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" @@ -2159,14 +2205,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { params.speculative.model = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); return ctx_arg; } diff --git a/common/common.cpp b/common/common.cpp index 0fb19e79e..8491ad9a4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1054,38 +1054,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { return mparams; } -static ggml_type kv_cache_type_from_str(const std::string & s) { - if (s == "f32") { - return GGML_TYPE_F32; - } - if (s == "f16") { - return GGML_TYPE_F16; - } - if (s == "bf16") { - return GGML_TYPE_BF16; - } - if (s == "q8_0") { - return GGML_TYPE_Q8_0; - } - if (s == "q4_0") { - return GGML_TYPE_Q4_0; - } - if (s == "q4_1") { - return GGML_TYPE_Q4_1; - } - if (s == "iq4_nl") { - return GGML_TYPE_IQ4_NL; - } - if (s == "q5_0") { - return GGML_TYPE_Q5_0; - } - if (s == "q5_1") { - return GGML_TYPE_Q5_1; - } - - throw std::runtime_error("Unsupported cache type: " + s); -} - struct llama_context_params common_context_params_to_llama(const common_params & params) { auto cparams = llama_context_default_params(); @@ -1120,8 +1088,8 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; } - cparams.type_k = kv_cache_type_from_str(params.cache_type_k); - cparams.type_v = kv_cache_type_from_str(params.cache_type_v); + cparams.type_k = params.cache_type_k; + cparams.type_v = params.cache_type_v; return cparams; } @@ -1147,12 +1115,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p #define CURL_MAX_RETRY 3 #define CURL_RETRY_DELAY_SECONDS 2 - -static bool starts_with(const std::string & str, const std::string & prefix) { - // While we wait for C++20's std::string::starts_with... - return str.rfind(prefix, 0) == 0; -} - static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { int remaining_attempts = max_attempts; diff --git a/common/common.h b/common/common.h index 92cd4e9c1..a7aeda5cf 100644 --- a/common/common.h +++ b/common/common.h @@ -43,9 +43,9 @@ using llama_tokens = std::vector; // build info extern int LLAMA_BUILD_NUMBER; -extern char const * LLAMA_COMMIT; -extern char const * LLAMA_COMPILER; -extern char const * LLAMA_BUILD_TARGET; +extern const char * LLAMA_COMMIT; +extern const char * LLAMA_COMPILER; +extern const char * LLAMA_BUILD_TARGET; struct common_control_vector_load_info; @@ -293,8 +293,8 @@ struct common_params { bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data - std::string cache_type_k = "f16"; // KV cache data type for the K - std::string cache_type_v = "f16"; // KV cache data type for the V + ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K + ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector // NOLINT @@ -446,6 +446,11 @@ std::vector string_split(const std::string & input, ch return parts; } +static bool string_starts_with(const std::string & str, + const std::string & prefix) { // While we wait for C++20's std::string::starts_with... + return str.rfind(prefix, 0) == 0; +} + bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c63d929c1..206ab502e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2001,6 +2001,29 @@ class Qwen2Model(Model): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) +@Model.register("Qwen2VLForConditionalGeneration") +class Qwen2VLModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN2VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + mrope_section = self.hparams["rope_scaling"]["mrope_section"] + mrope_section += [0] * max(0, 4 - len(mrope_section)) + self.gguf_writer.add_rope_dimension_sections(mrope_section) + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, data in super().get_tensors(): + if name.startswith("visual."): + continue + yield name, data + + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): model_arch = gguf.MODEL_ARCH.QWEN2MOE diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9210e9fea..21b31392e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,7 +20,12 @@ else() add_subdirectory(batched) add_subdirectory(embedding) add_subdirectory(eval-callback) - add_subdirectory(gbnf-validator) + + if (NOT WIN32) + # disabled on Windows because it uses internal functions not exported with LLAMA_API + add_subdirectory(gbnf-validator) + endif() + add_subdirectory(gguf-hash) add_subdirectory(gguf-split) add_subdirectory(gguf) @@ -46,12 +51,16 @@ else() add_subdirectory(speculative) add_subdirectory(speculative-simple) add_subdirectory(tokenize) + add_subdirectory(gen-docs) if (NOT GGML_BACKEND_DL) # these examples use the backends directly and cannot be built with dynamic loading add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(cvector-generator) add_subdirectory(export-lora) - add_subdirectory(quantize-stats) + if (NOT WIN32) + # disabled on Windows because it uses internal functions not exported with LLAMA_API + add_subdirectory(quantize-stats) + endif() add_subdirectory(llava) if (GGML_RPC) add_subdirectory(rpc) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 7e62657e1..75f63f938 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -287,7 +287,7 @@ struct split_strategy { } void print_info() { - printf("n_split: %ld\n", ctx_outs.size()); + printf("n_split: %zu\n", ctx_outs.size()); int i_split = 0; for (auto & ctx_out : ctx_outs) { // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) @@ -297,7 +297,7 @@ struct split_strategy { total_size += ggml_nbytes(t); } total_size = total_size / 1000 / 1000; // convert to megabytes - printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); i_split++; } } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index bac606f47..2338ad106 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) { for (const auto & inst : params_instances) { params_idx++; if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); + fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count); } // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { @@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) { // warmup run if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count); } //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count); } test_gen(ctx, 1, t.n_threads); } @@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) { if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, + fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, + fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_gen(ctx, t.n_gen, t.n_threads); diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 5d32f377f..3ce0d60c8 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -43,3 +43,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +set(TARGET llama-qwen2vl-cli) +add_executable(${TARGET} qwen2vl-cli.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d7c94352b..ba28c07c6 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -102,7 +102,9 @@ static std::string format(const char * fmt, ...) { #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" +#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger" #define KEY_USE_GELU "clip.use_gelu" +#define KEY_USE_SILU "clip.use_silu" #define KEY_N_EMBD "clip.%s.embedding_length" #define KEY_N_FF "clip.%s.feed_forward_length" #define KEY_N_BLOCK "clip.%s.block_count" @@ -129,7 +131,8 @@ static std::string format(const char * fmt, ...) { #define TN_TOKEN_EMBD "%s.token_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" -#define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat +#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" @@ -163,6 +166,7 @@ enum projector_type { PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, PROJECTOR_TYPE_RESAMPLER, + PROJECTOR_TYPE_MERGER, PROJECTOR_TYPE_UNKNOWN, }; @@ -171,6 +175,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDPV2, "ldpv2"}, { PROJECTOR_TYPE_RESAMPLER, "resampler"}, + { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, }; @@ -463,7 +468,8 @@ struct clip_vision_model { // embeddings struct ggml_tensor * class_embedding; - struct ggml_tensor * patch_embeddings; + struct ggml_tensor * patch_embeddings_0; + struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) struct ggml_tensor * patch_bias; struct ggml_tensor * position_embeddings; @@ -553,6 +559,7 @@ struct clip_ctx { bool has_vision_encoder = false; bool has_llava_projector = false; bool has_minicpmv_projector = false; + bool has_qwen2vl_merger = false; int minicpmv_version = 2; struct clip_vision_model vision_model; @@ -561,6 +568,7 @@ struct clip_ctx { float image_mean[3]; float image_std[3]; bool use_gelu = false; + bool use_silu = false; int32_t ftype = 1; bool has_class_embedding = true; @@ -606,14 +614,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 image_size_height = imgs->data->ny; } } + else if (ctx->has_qwen2vl_merger) { + // use the image's native resolution when image is avaible + if (is_inf) { + // if (imgs->data->nx && imgs->data->ny) { + image_size_width = imgs->data->nx; + image_size_height = imgs->data->ny; + } + } const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int patches_w = image_size_width / patch_size; + const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); + const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; int n_layer = hparams.n_layer; const float eps = hparams.eps; + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; const int batch_size = imgs->size; @@ -634,10 +654,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + if (ctx->has_qwen2vl_merger) { + GGML_ASSERT(image_size_width % (patch_size * 2) == 0); + GGML_ASSERT(image_size_height % (patch_size * 2) == 0); + + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, patches_h, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); + inp = ggml_reshape_3d( + ctx0, inp, + hidden_size, patches_w * patches_h, batch_size); + } + else { + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + } if (ctx->has_patch_bias) { // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); @@ -659,12 +699,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } } - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); ggml_set_name(positions, "positions"); ggml_set_input(positions); - embeddings = - ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); + if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding + embeddings = + ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); + } if (ctx->has_minicpmv_projector) { int pos_w = image_size_width/patch_size; @@ -688,7 +730,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // loop over layers - if (ctx->has_minicpmv_projector) { + if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) { + // TODO: figure out why we doing thing in this way ??? n_layer += 1; } for (int il = 0; il < n_layer - 1; il++) { @@ -710,8 +753,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); - Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + if (ctx->has_qwen2vl_merger) { + Q = ggml_rope_multi( + ctx0, Q, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + } + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); @@ -719,6 +767,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + if (ctx->has_qwen2vl_merger) { + K = ggml_rope_multi( + ctx0, K, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + } K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); @@ -758,6 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (ctx->use_gelu) { cur = ggml_gelu_inplace(ctx0, cur); + } else if (ctx->use_silu) { + cur = ggml_silu_inplace(ctx0, cur); } else { cur = ggml_gelu_quick_inplace(ctx0, cur); } @@ -769,6 +824,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 cur = ggml_add(ctx0, embeddings, cur); embeddings = cur; + } // post-layernorm @@ -1030,6 +1086,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 GGML_ASSERT(false); } } + else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); + + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + } // build the graph ggml_build_forward_expand(gf, embeddings); @@ -1206,6 +1275,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx); } + idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER); + if (idx != -1) { + new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx); + } // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search GGML_ASSERT(new_clip->has_vision_encoder); @@ -1214,6 +1287,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { idx = get_key_idx(ctx, KEY_USE_GELU); new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + try { + idx = get_key_idx(ctx, KEY_USE_SILU); + new_clip->use_silu = gguf_get_val_bool(ctx, idx); + } catch (std::runtime_error & /*e*/) { + new_clip->use_silu = false; + } + if (verbosity >= 1) { LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); @@ -1389,11 +1469,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } try { - vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); } catch(const std::exception& /*e*/) { LOG_ERR("%s: failed to load vision model tensors\n", __func__); } + try { + vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1); + } catch(const std::exception& /*e*/) { + new_clip->has_qwen2vl_merger = false; + } // LLaVA projection if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { @@ -1481,6 +1566,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); } + else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) { + vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); + vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -1519,6 +1610,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); clip_image_f32_batch batch; batch.size = 1; + batch.data = nullptr; ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); @@ -1532,6 +1624,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size ctx_clip->load_image_size = load_image_size; } +struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) { + return ctx_clip->load_image_size; +} + struct clip_image_size * clip_image_size_init() { struct clip_image_size * load_image_size = new struct clip_image_size(); load_image_size->width = 448; @@ -1984,6 +2080,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } return true; } + else if (ctx->has_qwen2vl_merger) { + clip_image_u8 * resized = clip_image_u8_init(); + auto patch_size = clip_patch_size(ctx) * 2; + int nx = ceil((float)img->nx / patch_size) * patch_size; + int ny = ceil((float)img->ny / patch_size) * patch_size; + bicubic_resize(*img, *resized, nx, ny); + + res_imgs->data = new clip_image_f32[1]; + // clip_image_f32 * res = clip_image_f32_init(); + normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std); + // res_imgs->data[0] = *res; + res_imgs->size = 1; + + // clip_image_f32_free(res); + clip_image_u8_free(resized); + return true; + } bool pad_to_square = true; if (!ctx->has_vision_encoder) { @@ -2173,6 +2286,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) { return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); } +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { + clip_image_f32 img; + img.nx = img_w; + img.ny = img_h; + return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + int32_t clip_image_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_size; } @@ -2194,6 +2314,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) { } int clip_n_patches(const struct clip_ctx * ctx) { + clip_image_f32 img; + img.nx = ctx->vision_model.hparams.image_size; + img.ny = ctx->vision_model.hparams.image_size; + return clip_n_patches_by_img(ctx, &img); +} + +int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); @@ -2207,6 +2334,11 @@ int clip_n_patches(const struct clip_ctx * ctx) { else if (ctx->minicpmv_version == 3) { n_patches = 64; } + } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + int patch_size = params.patch_size * 2; + int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); + int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); + n_patches = x_patch * y_patch; } return n_patches; @@ -2335,7 +2467,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int image_size = hparams.image_size; int image_size_width = image_size; int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { + if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) { image_size_width = imgs->data[0].nx; image_size_height = imgs->data[0].ny; } @@ -2355,7 +2487,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima for (size_t i = 0; i < imgs->size; i++) { const int nx = imgs->data[i].nx; const int ny = imgs->data[i].ny; - if (!ctx->has_minicpmv_projector) { + if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) { GGML_ASSERT(nx == image_size && ny == image_size); } @@ -2413,9 +2545,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); - for(int i=0;ihas_qwen2vl_merger) { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + + const int pw = image_size_width / patch_size; + const int ph = image_size_height / patch_size; + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + + int ptr = 0; + for (int y = 0; y < ph; y+=2) + { + for (int x = 0; x < pw; x+=2) + { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + positions_data[ptr] = y + dy; + positions_data[num_patches + ptr] = x + dx; + positions_data[num_patches * 2 + ptr] = y + dy; + positions_data[num_patches * 3 + ptr] = x + dx; + ptr++; + } + } + } + } + + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } + else { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); int* positions_data = (int*)malloc(ggml_nbytes(positions)); @@ -2444,16 +2603,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); free(positions_data); - } - { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); - int* patches_data = (int*)malloc(ggml_nbytes(patches)); - for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + 1; + { + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { + patches_data[i] = i + 1; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); } } @@ -2626,6 +2785,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return 3584; } } + if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + return ctx->vision_model.mm_1_b->ne[0]; + } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -2637,3 +2799,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) { } return 0; } + +bool clip_is_qwen2vl(const struct clip_ctx * ctx) { + return ctx->has_qwen2vl_merger; +} + + +bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { + clip_image_f32 clip_img; + clip_img.buf.resize(h * w * 3); + for (int i = 0; i < h*w*3; i++) + { + clip_img.buf[i] = img[i]; + } + clip_img.nx = w; + clip_img.ny = h; + clip_image_encode(ctx, n_threads, &clip_img, vec); + return true; +} diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 78588bdf1..1603edd26 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); +CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); @@ -55,11 +56,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); -CLIP_API int clip_n_patches (const struct clip_ctx * ctx); -CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); +CLIP_API int clip_n_patches (const struct clip_ctx * ctx); +CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); +CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); +CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip); CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); @@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); +CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); + +CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); #ifdef __cplusplus } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4ca53a0b8..16f30c56c 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -259,25 +259,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - if (clip_is_minicpmv(ctx_clip)) { + if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) { std::vector image_embd_v; image_embd_v.resize(img_res_v.size); struct clip_image_size * load_image_size = clip_image_size_init(); + for (size_t i = 0; i < img_res_v.size; i++) { const int64_t t_img_enc_step_start_us = ggml_time_us(); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); + image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); int patch_size=14; load_image_size->width = img_res_v.data[i].nx; load_image_size->height = img_res_v.data[i].ny; clip_add_load_image_size(ctx_clip, load_image_size); + bool encoded = false; - int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); - if (has_minicpmv_projector == 2) { - encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); - } - else if (has_minicpmv_projector == 3) { + if (clip_is_qwen2vl(ctx_clip)) { encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); } + else { + int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); + if (has_minicpmv_projector == 2) { + encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); + } + else if (has_minicpmv_projector == 3) { + encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); + } + } + if (!encoded) { LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; @@ -290,8 +298,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli int n_img_pos_out = 0; for (size_t i = 0; i < image_embd_v.size(); i++) { - std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); - n_img_pos_out += clip_n_patches(ctx_clip); + std::memcpy( + image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), + image_embd_v[i], + clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); + n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]); } *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -387,7 +398,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co if (clip_is_minicpmv(ctx_clip)) { num_max_patches = 10; } - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model + float * image_embd; + if (clip_is_qwen2vl(ctx_clip)) { + // qwen2vl don't split image into chunks, so `num_max_patches` is not needed. + image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny)); + } else { + image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model + } if (!image_embd) { LOG_ERR("Unable to allocate memory for image embeddings\n"); return false; diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py new file mode 100644 index 000000000..464ab80d3 --- /dev/null +++ b/examples/llava/qwen2_vl_surgery.py @@ -0,0 +1,158 @@ +import argparse +from typing import Dict + +import torch +import numpy as np +from gguf import * +from transformers import ( + Qwen2VLForConditionalGeneration, + Qwen2VLProcessor, + AutoProcessor, + Qwen2VLConfig +) + + +VISION = "clip.vision" + + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def to_gguf_name(name: str) -> str: + og = name + name = name.replace("text_model", "t").replace("vision_model", "v") + name = name.replace("blocks", "blk").replace("embeddings.", "") + name = name.replace("attn.", "attn_") + name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.") + # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln") + name = name.replace("norm1", "ln1").replace("norm2", "ln2") + name = name.replace("merger.mlp", 'mm') + print(f"[to_gguf_name] {og} --> {name}") + return name + + +def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]: + vision_model = qwen2vl.visual + tensor_map = {} + for name, ten in vision_model.state_dict().items(): + ten = ten.numpy() + if 'qkv' in name: + if ten.ndim == 2: # weight + c3, _ = ten.shape + else: # bias + c3 = ten.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = ten[:c] + wk = ten[c: c * 2] + wv = ten[c * 2:] + tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq + tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk + tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv + elif 'merger' in name: + if name.endswith("ln_q.weight"): + tensor_map['v.post_ln.weight'] = ten + elif name.endswith("ln_q.bias"): + tensor_map['v.post_ln.bias'] = ten + else: + # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias" + tensor_map[to_gguf_name(name)] = ten + elif 'patch_embed.proj.weight' in name: + # NOTE: split Conv3D into Conv2Ds + c1, c2, kt, kh, kw = ten.shape + assert kt == 2, "Current implmentation only support temporal_patch_size of 2" + tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...] + tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...] + else: + tensor_map[to_gguf_name(f"vision_model.{name}")] = ten + + for new_name, ten in tensor_map.items(): + if ten.ndim <= 1 or new_name.endswith("_norm.weight"): + tensor_map[new_name] = ten.astype(np.float32) + else: + tensor_map[new_name] = ten.astype(dtype) + tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder + return tensor_map + + +def main(args): + if args.data_type == 'fp32': + dtype = torch.float32 + np_dtype = np.float32 + ftype = 0 + elif args.data_type == 'fp16': + dtype = torch.float32 + np_dtype = np.float16 + ftype = 1 + else: + raise ValueError() + + model_name = args.model_name + print("model_name: ", model_name) + qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained( + model_name, torch_dtype=dtype, device_map="cpu" + ) + cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType] + vcfg = cfg.vision_config + + if os.path.isdir(model_name): + if model_name.endswith(os.sep): + model_name = model_name[:-1] + model_name = os.path.basename(model_name) + fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf" + + fout = GGUFWriter(path=fname_out, arch="clip") + fout.add_description("image encoder for Qwen2VL") + + fout.add_file_type(ftype) + fout.add_bool("clip.has_text_encoder", False) + fout.add_bool("clip.has_vision_encoder", True) + fout.add_bool("clip.has_qwen2vl_merger", True) + fout.add_string("clip.projector_type", "qwen2vl_merger") + + print(cfg.vision_config) + if 'silu' in cfg.vision_config.hidden_act.lower(): + fout.add_bool("clip.use_silu", True) + fout.add_bool("clip.use_gelu", False) + elif 'gelu' in cfg.vision_config.hidden_act.lower(): + fout.add_bool("clip.use_silu", False) + fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower()) + else: + raise ValueError() + + tensor_map = find_vision_tensors(qwen2vl, np_dtype) + for name, data in tensor_map.items(): + fout.add_tensor(name, data) + + fout.add_uint32("clip.vision.patch_size", vcfg.patch_size) + fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim) + fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder + fout.add_name(model_name) + """ + HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig, + it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`. + """ + + processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name) + fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue] + fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue] + + fout.write_header_to_file() + fout.write_kv_data_to_file() + fout.write_tensors_to_file() + fout.close() + print("save model as: ", fname_out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct") + parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32") + args = parser.parse_args() + main(args) diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp new file mode 100644 index 000000000..e86a60280 --- /dev/null +++ b/examples/llava/qwen2vl-cli.cpp @@ -0,0 +1,581 @@ +#include "arg.h" +#include "base64.hpp" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include "clip.h" +#include "llava.h" +#include "llama.h" +#include "ggml.h" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif +#ifdef NDEBUG +#include "ggml-alloc.h" +#include "ggml-backend.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + + +static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, + int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) { + int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + const int patch_size = 14 * 2; + const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0); + const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0); + auto img_tokens = image_embed->n_image_pos; + // llama_pos mrope_pos[img_tokens * 4]; + std::vector mrope_pos; + mrope_pos.resize(img_tokens * 4); + + for (int y = 0; y < ph; y++) + { + for (int x = 0; x < pw; x++) + { + int i = y * pw + x; + mrope_pos[i] = *st_pos_id; + mrope_pos[i + img_tokens] = *st_pos_id + y; + mrope_pos[i + img_tokens * 2] = *st_pos_id + x; + mrope_pos[i + img_tokens * 3] = 0; + } + } + *st_pos_id += std::max(pw, ph); + + int processed = 0; + std::vector batch_mrope_pos; + batch_mrope_pos.resize(img_tokens * 4); + + for (int i = 0; i < img_tokens; i += n_batch) { + int n_eval = img_tokens - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + + // llama_pos batch_mrope_pos[n_eval * 4]; + std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0); + memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos)); + memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos)); + memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos)); + memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos)); + + llama_batch batch = { + int32_t(n_eval), // n_tokens + nullptr, // token + (image_embed->embed+i*n_embd), // embed + batch_mrope_pos.data(), // pos + nullptr, // n_seq_id + nullptr, // seq_id + nullptr, // logits + }; + + if (llama_decode(ctx_llama, batch)) { + LOG_ERR("%s : failed to eval\n", __func__); + return false; + } + *n_past += n_eval; + processed += n_eval; + } + return true; +} + + +static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past, int * st_pos_id) { + int N = (int) tokens.size(); + std::vector pos; + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + auto batch = llama_batch_get_one(&tokens[i], n_eval); + // TODO: add mrope pos ids somewhere else + pos.resize(batch.n_tokens * 4); + std::fill(pos.begin(), pos.end(), 0); + for (int j = 0; j < batch.n_tokens * 3; j ++) { + pos[j] = *st_pos_id + (j % batch.n_tokens); + } + batch.pos = pos.data(); + + if (llama_decode(ctx_llama, batch)) { + LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + return false; + } + *n_past += n_eval; + *st_pos_id += n_eval; + } + return true; +} + +static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id); +} + +static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){ + std::string str2 = str; + std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); + eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id); + return true; +} + +static const char * sample(struct common_sampler * smpl, + struct llama_context * ctx_llama, + int * n_past, int * st_pos_id) { + const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); + common_sampler_accept(smpl, id, true); + static std::string ret; + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + ret = ""; + } else { + ret = common_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past, st_pos_id); + return ret.c_str(); +} + +static const char* IMG_BASE64_TAG_BEGIN = ""; + +static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) { + begin_out = prompt.find(IMG_BASE64_TAG_BEGIN); + end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out); +} + +static bool prompt_contains_image(const std::string& prompt) { + size_t begin, end; + find_image_tag_in_prompt(prompt, begin, end); + return (begin != std::string::npos); +} + +// replaces the base64 image tag in the prompt with `replacement` +static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) { + size_t img_base64_str_start, img_base64_str_end; + find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); + if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { + LOG_ERR("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); + return NULL; + } + + auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN); + auto base64_bytes_count = img_base64_str_end - base64_bytes_start; + auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count ); + + auto required_bytes = base64::required_encode_size(base64_str.size()); + auto img_bytes = std::vector(required_bytes); + base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin()); + + auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); + if (!embed) { + LOG_ERR("%s: could not load image from base64 string.\n", __func__); + return NULL; + } + + return embed; +} + +static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") { + size_t begin, end; + find_image_tag_in_prompt(prompt, begin, end); + if (begin == std::string::npos || end == std::string::npos) { + return prompt; + } + auto pre = prompt.substr(0, begin); + auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END)); + return pre + replacement + post; +} + +struct llava_context { + struct clip_ctx * ctx_clip = NULL; + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + +static void print_usage(int, char ** argv) { + LOG("\n example usage:\n"); + LOG("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) { + + // load and preprocess the image + llava_image_embed * embed = NULL; + auto prompt = params->prompt; + if (prompt_contains_image(prompt)) { + if (!params->image.empty()) { + LOG_INF("using base64 encoded image instead of command line image path\n"); + } + embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); + if (!embed) { + LOG_ERR("%s: can't load image from prompt\n", __func__); + return NULL; + } + params->prompt = remove_image_from_prompt(prompt); + } else { + embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str()); + if (!embed) { + fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); + return NULL; + } + } + + return embed; +} + +static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) { + int n_past = 0; + int cur_pos_id = 0; + + const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; + + std::string system_prompt, user_prompt; + size_t image_pos = prompt.find("<|vision_start|>"); + if (image_pos != std::string::npos) { + // new templating mode: Provide the full prompt including system message and use as a placeholder for the image + system_prompt = prompt.substr(0, image_pos); + user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length()); + LOG_INF("system_prompt: %s\n", system_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } + LOG_INF("user_prompt: %s\n", user_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } + } else { + // llava-1.5 native mode + system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>"; + user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n"; + if (params->verbose_prompt) { + auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } + } + + eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true); + if (image_embed != nullptr) { + auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip); + qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size); + } + eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false); + + // generate the response + + LOG("\n"); + + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); + if (!smpl) { + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + exit(1); + } + + std::string response = ""; + for (int i = 0; i < max_tgt_len; i++) { + const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id); + response += tmp; + if (strcmp(tmp, "") == 0) break; + if (strstr(tmp, "###")) break; // Yi-VL behavior + LOG("%s", tmp); + if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) + if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 + if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 + + fflush(stdout); + } + + common_sampler_free(smpl); + LOG("\n"); +} + +static struct llama_model * llava_init(common_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); + + llama_model_params model_params = common_model_params_to_llama(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == NULL) { + LOG_ERR("%s: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +static struct llava_context * llava_init_context(common_params * params, llama_model * model) { + const char * clip_path = params->mmproj.c_str(); + + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + + auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + + + llama_context_params ctx_params = common_context_params_to_llama(*params); + ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + LOG_ERR("%s: failed to create the llama_context\n" , __func__); + return NULL; + } + + auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + + ctx_llava->ctx_llama = ctx_llama; + ctx_llava->ctx_clip = ctx_clip; + ctx_llava->model = model; + return ctx_llava; +} + +static void llava_free(struct llava_context * ctx_llava) { + if (ctx_llava->ctx_clip) { + clip_free(ctx_llava->ctx_clip); + ctx_llava->ctx_clip = NULL; + } + + llama_free(ctx_llava->ctx_llama); + llama_free_model(ctx_llava->model); + llama_backend_free(); +} + +#ifndef NDEBUG + +static void debug_test_mrope_2d() { + // 1. Initialize backend + ggml_backend_t backend = NULL; + std::string backend_name = ""; +#ifdef GGML_USE_CUDA + fprintf(stderr, "%s: using CUDA backend\n", __func__); + backend = ggml_backend_cuda_init(0); // init device 0 + backend_name = "cuda"; + if (!backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } +#endif + // if there aren't GPU Backends fallback to CPU backend + if (!backend) { + backend = ggml_backend_cpu_init(); + backend_name = "cpu"; + } + + // Calculate the size needed to allocate + size_t ctx_size = 0; + ctx_size += 2 * ggml_tensor_overhead(); // tensors + // no need to allocate anything else! + + // 2. Allocate `ggml_context` to store tensor data + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors() + }; + struct ggml_context * ctx = ggml_init(params); + + struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4); + ggml_set_name(pos, "pos"); + ggml_set_input(pos); + + std::vector dummy_q; + dummy_q.resize(128 * 12 * 30); + std::fill(dummy_q.begin(), dummy_q.end(), 0.1); + // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw)); + + std::vector pos_id; + pos_id.resize(30 * 4); + for (int i = 0; i < 30; i ++) { + pos_id[i] = i; + pos_id[i + 30] = i + 10; + pos_id[i + 60] = i + 20; + pos_id[i + 90] = i + 30; + } + int sections[4] = {32, 32, 0, 0}; + + // 4. Allocate a `ggml_backend_buffer` to store all tensors + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + + // 5. Copy tensor data from main memory (RAM) to backend buffer + ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw)); + ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos)); + + // 6. Create a `ggml_cgraph` for mul_mat operation + struct ggml_cgraph * gf = NULL; + struct ggml_context * ctx_cgraph = NULL; + + // create a temporally context to build the graph + struct ggml_init_params params0 = { + /*.mem_size =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() + }; + ctx_cgraph = ggml_init(params0); + gf = ggml_new_graph(ctx_cgraph); + + struct ggml_tensor * result0 = ggml_rope_multi( + ctx_cgraph, inp_raw, pos, nullptr, + 128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1, + 0, 1, 32, 1); + + // Add "result" tensor and all of its dependencies to the cgraph + ggml_build_forward_expand(gf, result0); + + // 7. Create a `ggml_gallocr` for cgraph computation + ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + ggml_gallocr_alloc_graph(allocr, gf); + + // 9. Run the computation + int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading + if (ggml_backend_is_cpu(backend)) { + ggml_backend_cpu_set_n_threads(backend, n_threads); + } + ggml_backend_graph_compute(backend, gf); + + // 10. Retrieve results (output tensors) + // in this example, output tensor is always the last tensor in the graph + struct ggml_tensor * result = result0; + // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1]; + float * result_data = (float *)malloc(ggml_nbytes(result)); + // because the tensor data is stored in device buffer, we need to copy it back to RAM + ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result)); + const std::string bin_file = "mrope_2d_" + backend_name +".bin"; + std::ofstream outFile(bin_file, std::ios::binary); + + if (outFile.is_open()) { + outFile.write(reinterpret_cast(result_data), ggml_nbytes(result)); + outFile.close(); + std::cout << "Data successfully written to " + bin_file << std::endl; + } else { + std::cerr << "Error opening file!" << std::endl; + } + + free(result_data); + // 11. Free memory and exit + ggml_free(ctx_cgraph); + ggml_gallocr_free(allocr); + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); +} + +static void debug_dump_img_embed(struct llava_context * ctx_llava) { + int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama)); + int ne = n_embd * 4; + float vals[56 * 56 * 3]; + // float embd[ne]; + std::vector embd; + embd.resize(ne); + + for (int i = 0; i < 56*56; i++) + { + for (int c = 0; c < 3; c++) + vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56); + } + + clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data()); + + std::ofstream outFile("img_embed.bin", std::ios::binary); + if (outFile.is_open()) { + outFile.write(reinterpret_cast(embd.data()), ne * sizeof(float)); + + outFile.close(); + std::cout << "Data successfully written to mrope.bin" << std::endl; + } else { + std::cerr << "Error opening file!" << std::endl; + } +} + +#endif + + +int main(int argc, char ** argv) { + ggml_time_init(); + + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { + return 1; + } + + common_init(); + + if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { + print_usage(argc, argv); + return 1; + } + + auto * model = llava_init(¶ms); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to init llava model\n", __func__); + return 1; + } + + if (prompt_contains_image(params.prompt)) { + auto * ctx_llava = llava_init_context(¶ms, model); + + auto * image_embed = load_image(ctx_llava, ¶ms, ""); + + // process the prompt + process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); + + llama_perf_context_print(ctx_llava->ctx_llama); + llava_image_embed_free(image_embed); + ctx_llava->model = NULL; + llava_free(ctx_llava); +#ifndef NDEBUG + } else if (params.image[0].empty()) { + auto ctx_llava = llava_init_context(¶ms, model); + + debug_test_mrope_2d(); + debug_dump_img_embed(ctx_llava); + + llama_perf_context_print(ctx_llava->ctx_llama); + ctx_llava->model = NULL; + llava_free(ctx_llava); +#endif + } else { + for (auto & image : params.image) { + auto * ctx_llava = llava_init_context(¶ms, model); + + auto * image_embed = load_image(ctx_llava, ¶ms, image); + if (!image_embed) { + LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); + return 1; + } + + // process the prompt + process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); + + llama_perf_context_print(ctx_llava->ctx_llama); + llava_image_embed_free(image_embed); + ctx_llava->model = NULL; + llava_free(ctx_llava); + } + } + + llama_free_model(model); + + return 0; +} diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67..f9cce7b21 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index e78a8596d..23ff4db27 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -143,7 +143,7 @@ int main(int argc, char ** argv) { std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); } - LOG_INF("Number of chunks: %ld\n", chunks.size()); + LOG_INF("Number of chunks: %zu\n", chunks.size()); llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 52add51ef..0686d6305 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-run) add_executable(${TARGET} run.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/run/README.md b/examples/run/README.md index 6e926811f..6162658e9 100644 --- a/examples/run/README.md +++ b/examples/run/README.md @@ -3,5 +3,45 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models. ```bash -./llama-run Meta-Llama-3.1-8B-Instruct.gguf +llama-run granite-code +... + +```bash +llama-run -h +Description: + Runs a llm + +Usage: + llama-run [options] model [prompt] + +Options: + -c, --context-size + Context size (default: 2048) + -n, --ngl + Number of GPU layers (default: 0) + -h, --help + Show help message + +Commands: + model + Model is a string with an optional prefix of + huggingface:// (hf://), ollama://, https:// or file://. + If no protocol is specified and a file exists in the specified + path, file:// is assumed, otherwise if a file does not exist in + the specified path, ollama:// is assumed. Models that are being + pulled are downloaded with .partial extension while being + downloaded and then renamed as the file without the .partial + extension when complete. + +Examples: + llama-run llama3 + llama-run ollama://granite-code + llama-run ollama://smollm:135m + llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf + llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf + llama-run https://example.com/some-file1.gguf + llama-run some-file2.gguf + llama-run file://some-file3.gguf + llama-run --ngl 99 some-file4.gguf + llama-run --ngl 99 some-file5.gguf Hello World ... diff --git a/examples/run/run.cpp b/examples/run/run.cpp index cac2faefc..834ea8f7b 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1,128 +1,350 @@ #if defined(_WIN32) -#include +# include #else -#include +# include #endif -#include +#if defined(LLAMA_USE_CURL) +# include +#endif + +#include #include #include +#include #include #include #include -#include #include +#include "common.h" +#include "json.hpp" #include "llama-cpp.h" -typedef std::unique_ptr char_array_ptr; +#define printe(...) \ + do { \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) -struct Argument { - std::string flag; - std::string help_text; -}; +class Opt { + public: + int init(int argc, const char ** argv) { + construct_help_str_(); + // Parse arguments + if (parse(argc, argv)) { + printe("Error: Failed to parse arguments.\n"); + help(); + return 1; + } -struct Options { - std::string model_path, prompt_non_interactive; - int ngl = 99; - int n_ctx = 2048; -}; + // If help is requested, show help and exit + if (help_) { + help(); + return 2; + } -class ArgumentParser { - public: - ArgumentParser(const char * program_name) : program_name(program_name) {} - - void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") { - string_args[flag] = &var; - arguments.push_back({flag, help_text}); + return 0; // Success } - void add_argument(const std::string & flag, int & var, const std::string & help_text = "") { - int_args[flag] = &var; - arguments.push_back({flag, help_text}); + std::string model_; + std::string user_; + int context_size_ = 2048, ngl_ = -1; + + private: + std::string help_str_; + bool help_ = false; + + void construct_help_str_() { + help_str_ = + "Description:\n" + " Runs a llm\n" + "\n" + "Usage:\n" + " llama-run [options] model [prompt]\n" + "\n" + "Options:\n" + " -c, --context-size \n" + " Context size (default: " + + std::to_string(context_size_); + help_str_ += + ")\n" + " -n, --ngl \n" + " Number of GPU layers (default: " + + std::to_string(ngl_); + help_str_ += + ")\n" + " -h, --help\n" + " Show help message\n" + "\n" + "Commands:\n" + " model\n" + " Model is a string with an optional prefix of \n" + " huggingface:// (hf://), ollama://, https:// or file://.\n" + " If no protocol is specified and a file exists in the specified\n" + " path, file:// is assumed, otherwise if a file does not exist in\n" + " the specified path, ollama:// is assumed. Models that are being\n" + " pulled are downloaded with .partial extension while being\n" + " downloaded and then renamed as the file without the .partial\n" + " extension when complete.\n" + "\n" + "Examples:\n" + " llama-run llama3\n" + " llama-run ollama://granite-code\n" + " llama-run ollama://smollm:135m\n" + " llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n" + " llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n" + " llama-run https://example.com/some-file1.gguf\n" + " llama-run some-file2.gguf\n" + " llama-run file://some-file3.gguf\n" + " llama-run --ngl 99 some-file4.gguf\n" + " llama-run --ngl 99 some-file5.gguf Hello World\n"; } int parse(int argc, const char ** argv) { + int positional_args_i = 0; for (int i = 1; i < argc; ++i) { - std::string arg = argv[i]; - if (string_args.count(arg)) { - if (i + 1 < argc) { - *string_args[arg] = argv[++i]; - } else { - fprintf(stderr, "error: missing value for %s\n", arg.c_str()); - print_usage(); + if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) { + if (i + 1 >= argc) { return 1; } - } else if (int_args.count(arg)) { - if (i + 1 < argc) { - if (parse_int_arg(argv[++i], *int_args[arg]) != 0) { - fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]); - print_usage(); - return 1; - } - } else { - fprintf(stderr, "error: missing value for %s\n", arg.c_str()); - print_usage(); + + context_size_ = std::atoi(argv[++i]); + } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) { + if (i + 1 >= argc) { return 1; } + + ngl_ = std::atoi(argv[++i]); + } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { + help_ = true; + return 0; + } else if (!positional_args_i) { + ++positional_args_i; + model_ = argv[i]; + } else if (positional_args_i == 1) { + ++positional_args_i; + user_ = argv[i]; } else { - fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str()); - print_usage(); - return 1; + user_ += " " + std::string(argv[i]); } } - if (string_args["-m"]->empty()) { - fprintf(stderr, "error: -m is required\n"); - print_usage(); + return model_.empty(); // model_ is the only required value + } + + void help() const { printf("%s", help_str_.c_str()); } +}; + +struct progress_data { + size_t file_size = 0; + std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); + bool printed = false; +}; + +struct FileDeleter { + void operator()(FILE * file) const { + if (file) { + fclose(file); + } + } +}; + +typedef std::unique_ptr FILE_ptr; + +#ifdef LLAMA_USE_CURL +class CurlWrapper { + public: + int init(const std::string & url, const std::vector & headers, const std::string & output_file, + const bool progress, std::string * response_str = nullptr) { + std::string output_file_partial; + curl = curl_easy_init(); + if (!curl) { return 1; } + progress_data data; + FILE_ptr out; + if (!output_file.empty()) { + output_file_partial = output_file + ".partial"; + out.reset(fopen(output_file_partial.c_str(), "ab")); + } + + set_write_options(response_str, out); + data.file_size = set_resume_point(output_file_partial); + set_progress_options(progress, data); + set_headers(headers); + perform(url); + if (!output_file.empty()) { + std::filesystem::rename(output_file_partial, output_file); + } + return 0; } - private: - const char * program_name; - std::unordered_map string_args; - std::unordered_map int_args; - std::vector arguments; + ~CurlWrapper() { + if (chunk) { + curl_slist_free_all(chunk); + } - int parse_int_arg(const char * arg, int & value) { - char * end; - const long val = std::strtol(arg, &end, 10); - if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) { - value = static_cast(val); + if (curl) { + curl_easy_cleanup(curl); + } + } + + private: + CURL * curl = nullptr; + struct curl_slist * chunk = nullptr; + + void set_write_options(std::string * response_str, const FILE_ptr & out) { + if (response_str) { + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str); + } else { + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.get()); + } + } + + size_t set_resume_point(const std::string & output_file) { + size_t file_size = 0; + if (std::filesystem::exists(output_file)) { + file_size = std::filesystem::file_size(output_file); + curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast(file_size)); + } + + return file_size; + } + + void set_progress_options(bool progress, progress_data & data) { + if (progress) { + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data); + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback); + } + } + + void set_headers(const std::vector & headers) { + if (!headers.empty()) { + if (chunk) { + curl_slist_free_all(chunk); + chunk = 0; + } + + for (const auto & header : headers) { + chunk = curl_slist_append(chunk, header.c_str()); + } + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk); + } + } + + void perform(const std::string & url) { + CURLcode res; + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https"); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L); + res = curl_easy_perform(curl); + if (res != CURLE_OK) { + printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res)); + } + } + + static std::string human_readable_time(double seconds) { + int hrs = static_cast(seconds) / 3600; + int mins = (static_cast(seconds) % 3600) / 60; + int secs = static_cast(seconds) % 60; + + std::ostringstream out; + if (hrs > 0) { + out << hrs << "h " << std::setw(2) << std::setfill('0') << mins << "m " << std::setw(2) << std::setfill('0') + << secs << "s"; + } else if (mins > 0) { + out << mins << "m " << std::setw(2) << std::setfill('0') << secs << "s"; + } else { + out << secs << "s"; + } + + return out.str(); + } + + static std::string human_readable_size(curl_off_t size) { + static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" }; + char length = sizeof(suffix) / sizeof(suffix[0]); + int i = 0; + double dbl_size = size; + if (size > 1024) { + for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) { + dbl_size = size / 1024.0; + } + } + + std::ostringstream out; + out << std::fixed << std::setprecision(2) << dbl_size << " " << suffix[i]; + return out.str(); + } + + static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t, + curl_off_t) { + progress_data * data = static_cast(ptr); + if (total_to_download <= 0) { return 0; } - return 1; - } - void print_usage() const { - printf("\nUsage:\n"); - printf(" %s [OPTIONS]\n\n", program_name); - printf("Options:\n"); - for (const auto & arg : arguments) { - printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str()); + total_to_download += data->file_size; + const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size; + const curl_off_t percentage = (now_downloaded_plus_file_size * 100) / total_to_download; + const curl_off_t pos = (percentage / 5); + std::string progress_bar; + for (int i = 0; i < 20; ++i) { + progress_bar.append((i < pos) ? "█" : " "); } - printf("\n"); + // Calculate download speed and estimated time to completion + const auto now = std::chrono::steady_clock::now(); + const std::chrono::duration elapsed_seconds = now - data->start_time; + const double speed = now_downloaded / elapsed_seconds.count(); + const double estimated_time = (total_to_download - now_downloaded) / speed; + printe("\r%ld%% |%s| %s/%s %.2f MB/s %s ", percentage, progress_bar.c_str(), + human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(), + speed / (1024 * 1024), human_readable_time(estimated_time).c_str()); + fflush(stderr); + data->printed = true; + + return 0; + } + + // Function to write data to a file + static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) { + FILE * out = static_cast(stream); + return fwrite(ptr, size, nmemb, out); + } + + // Function to capture data into a string + static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) { + std::string * str = static_cast(stream); + str->append(static_cast(ptr), size * nmemb); + return size * nmemb; } }; +#endif class LlamaData { - public: - llama_model_ptr model; - llama_sampler_ptr sampler; - llama_context_ptr context; + public: + llama_model_ptr model; + llama_sampler_ptr sampler; + llama_context_ptr context; std::vector messages; + std::vector msg_strs; + std::vector fmtted; - int init(const Options & opt) { - model = initialize_model(opt.model_path, opt.ngl); + int init(Opt & opt) { + model = initialize_model(opt); if (!model) { return 1; } - context = initialize_context(model, opt.n_ctx); + context = initialize_context(model, opt.context_size_); if (!context) { return 1; } @@ -131,15 +353,123 @@ class LlamaData { return 0; } - private: - // Initializes the model and returns a unique pointer to it - llama_model_ptr initialize_model(const std::string & model_path, const int ngl) { - llama_model_params model_params = llama_model_default_params(); - model_params.n_gpu_layers = ngl; + private: +#ifdef LLAMA_USE_CURL + int download(const std::string & url, const std::vector & headers, const std::string & output_file, + const bool progress, std::string * response_str = nullptr) { + CurlWrapper curl; + if (curl.init(url, headers, output_file, progress, response_str)) { + return 1; + } - llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params)); + return 0; + } +#else + int download(const std::string &, const std::vector &, const std::string &, const bool, + std::string * = nullptr) { + printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + return 1; + } +#endif + + int huggingface_dl(const std::string & model, const std::vector headers, const std::string & bn) { + // Find the second occurrence of '/' after protocol string + size_t pos = model.find('/'); + pos = model.find('/', pos + 1); + if (pos == std::string::npos) { + return 1; + } + + const std::string hfr = model.substr(0, pos); + const std::string hff = model.substr(pos + 1); + const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff; + return download(url, headers, bn, true); + } + + int ollama_dl(std::string & model, const std::vector headers, const std::string & bn) { + if (model.find('/') == std::string::npos) { + model = "library/" + model; + } + + std::string model_tag = "latest"; + size_t colon_pos = model.find(':'); + if (colon_pos != std::string::npos) { + model_tag = model.substr(colon_pos + 1); + model = model.substr(0, colon_pos); + } + + std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag; + std::string manifest_str; + const int ret = download(manifest_url, headers, "", false, &manifest_str); + if (ret) { + return ret; + } + + nlohmann::json manifest = nlohmann::json::parse(manifest_str); + std::string layer; + for (const auto & l : manifest["layers"]) { + if (l["mediaType"] == "application/vnd.ollama.image.model") { + layer = l["digest"]; + break; + } + } + + std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer; + return download(blob_url, headers, bn, true); + } + + std::string basename(const std::string & path) { + const size_t pos = path.find_last_of("/\\"); + if (pos == std::string::npos) { + return path; + } + + return path.substr(pos + 1); + } + + int remove_proto(std::string & model_) { + const std::string::size_type pos = model_.find("://"); + if (pos == std::string::npos) { + return 1; + } + + model_ = model_.substr(pos + 3); // Skip past "://" + return 0; + } + + int resolve_model(std::string & model_) { + const std::string bn = basename(model_); + const std::vector headers = { "--header", + "Accept: application/vnd.docker.distribution.manifest.v2+json" }; + int ret = 0; + if (string_starts_with(model_, "file://") || std::filesystem::exists(bn)) { + remove_proto(model_); + } else if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) { + remove_proto(model_); + ret = huggingface_dl(model_, headers, bn); + } else if (string_starts_with(model_, "ollama://")) { + remove_proto(model_); + ret = ollama_dl(model_, headers, bn); + } else if (string_starts_with(model_, "https://")) { + download(model_, headers, bn, true); + } else { + ret = ollama_dl(model_, headers, bn); + } + + model_ = bn; + + return ret; + } + + // Initializes the model and returns a unique pointer to it + llama_model_ptr initialize_model(Opt & opt) { + ggml_backend_load_all(); + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers; + resolve_model(opt.model_); + llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params)); if (!model) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str()); } return model; @@ -148,12 +478,11 @@ class LlamaData { // Initializes the context with the specified parameters llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) { llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = n_ctx; - ctx_params.n_batch = n_ctx; - + ctx_params.n_ctx = n_ctx; + ctx_params.n_batch = n_ctx; llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params)); if (!context) { - fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__); + printe("%s: error: failed to create the llama_context\n", __func__); } return context; @@ -170,23 +499,22 @@ class LlamaData { } }; -// Add a message to `messages` and store its content in `owned_content` -static void add_message(const char * role, const std::string & text, LlamaData & llama_data, - std::vector & owned_content) { - char_array_ptr content(new char[text.size() + 1]); - std::strcpy(content.get(), text.c_str()); - llama_data.messages.push_back({role, content.get()}); - owned_content.push_back(std::move(content)); +// Add a message to `messages` and store its content in `msg_strs` +static void add_message(const char * role, const std::string & text, LlamaData & llama_data) { + llama_data.msg_strs.push_back(std::move(text)); + llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() }); } // Function to apply the chat template and resize `formatted` if needed -static int apply_chat_template(const LlamaData & llama_data, std::vector & formatted, const bool append) { - int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), - llama_data.messages.size(), append, formatted.data(), formatted.size()); - if (result > static_cast(formatted.size())) { - formatted.resize(result); +static int apply_chat_template(LlamaData & llama_data, const bool append) { + int result = llama_chat_apply_template( + llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append, + append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0); + if (append && result > static_cast(llama_data.fmtted.size())) { + llama_data.fmtted.resize(result); result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), - llama_data.messages.size(), append, formatted.data(), formatted.size()); + llama_data.messages.size(), append, llama_data.fmtted.data(), + llama_data.fmtted.size()); } return result; @@ -199,7 +527,8 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr prompt_tokens.resize(n_prompt_tokens); if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { - GGML_ABORT("failed to tokenize the prompt\n"); + printe("failed to tokenize the prompt\n"); + return -1; } return n_prompt_tokens; @@ -207,11 +536,11 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { - const int n_ctx = llama_n_ctx(ctx.get()); + const int n_ctx = llama_n_ctx(ctx.get()); const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); - fprintf(stderr, "context size exceeded\n"); + printe("context size exceeded\n"); return 1; } @@ -221,9 +550,10 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch & // convert the token to a string static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) { char buf[256]; - int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); + int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); if (n < 0) { - GGML_ABORT("failed to convert token to piece\n"); + printe("failed to convert token to piece\n"); + return 1; } piece = std::string(buf, n); @@ -238,19 +568,19 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st // helper function to evaluate a prompt and generate a response static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { - std::vector prompt_tokens; - const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens); - if (n_prompt_tokens < 0) { + std::vector tokens; + if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) { return 1; } // prepare a batch for the prompt - llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); llama_token new_token_id; while (true) { check_context_size(llama_data.context, batch); if (llama_decode(llama_data.context.get(), batch)) { - GGML_ABORT("failed to decode\n"); + printe("failed to decode\n"); + return 1; } // sample the next token, check is it an end of generation? @@ -273,22 +603,9 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str return 0; } -static int parse_arguments(const int argc, const char ** argv, Options & opt) { - ArgumentParser parser(argv[0]); - parser.add_argument("-m", opt.model_path, "model"); - parser.add_argument("-p", opt.prompt_non_interactive, "prompt"); - parser.add_argument("-c", opt.n_ctx, "context_size"); - parser.add_argument("-ngl", opt.ngl, "n_gpu_layers"); - if (parser.parse(argc, argv)) { - return 1; - } - - return 0; -} - static int read_user_input(std::string & user) { std::getline(std::cin, user); - return user.empty(); // Indicate an error or empty input + return user.empty(); // Should have data in happy path } // Function to generate a response based on the prompt @@ -296,7 +613,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt, // Set response color printf("\033[33m"); if (generate(llama_data, prompt, response)) { - fprintf(stderr, "failed to generate response\n"); + printe("failed to generate response\n"); return 1; } @@ -306,11 +623,10 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt, } // Helper function to apply the chat template and handle errors -static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector & formatted, - const bool is_user_input, int & output_length) { - const int new_len = apply_chat_template(llama_data, formatted, is_user_input); +static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) { + const int new_len = apply_chat_template(llama_data, append); if (new_len < 0) { - fprintf(stderr, "failed to apply the chat template\n"); + printe("failed to apply the chat template\n"); return -1; } @@ -319,56 +635,63 @@ static int apply_chat_template_with_error_handling(const LlamaData & llama_data, } // Helper function to handle user input -static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) { - if (!prompt_non_interactive.empty()) { - user_input = prompt_non_interactive; - return true; // No need for interactive input +static int handle_user_input(std::string & user_input, const std::string & user_) { + if (!user_.empty()) { + user_input = user_; + return 0; // No need for interactive input } - printf("\033[32m> \033[0m"); - return !read_user_input(user_input); // Returns false if input ends the loop + printf( + "\r " + "\r\033[32m> \033[0m"); + return read_user_input(user_input); // Returns true if input ends the loop } // Function to tokenize the prompt -static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) { - std::vector owned_content; - std::vector fmtted(llama_n_ctx(llama_data.context.get())); +static int chat_loop(LlamaData & llama_data, const std::string & user_) { int prev_len = 0; - + llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); while (true) { // Get user input std::string user_input; - if (!handle_user_input(user_input, prompt_non_interactive)) { - break; + while (handle_user_input(user_input, user_)) { } - add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data, - owned_content); - + add_message("user", user_.empty() ? user_input : user_, llama_data); int new_len; - if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) { + if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) { return 1; } - std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len); + std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); std::string response; if (generate_response(llama_data, prompt, response)) { return 1; } + + if (!user_.empty()) { + break; + } + + add_message("assistant", response, llama_data); + if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) { + return 1; + } } + return 0; } static void log_callback(const enum ggml_log_level level, const char * text, void *) { if (level == GGML_LOG_LEVEL_ERROR) { - fprintf(stderr, "%s", text); + printe("%s", text); } } static bool is_stdin_a_terminal() { #if defined(_WIN32) HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); - DWORD mode; + DWORD mode; return GetConsoleMode(hStdin, &mode); #else return isatty(STDIN_FILENO); @@ -382,17 +705,20 @@ static std::string read_pipe_data() { } int main(int argc, const char ** argv) { - Options opt; - if (parse_arguments(argc, argv, opt)) { + Opt opt; + const int ret = opt.init(argc, argv); + if (ret == 2) { + return 0; + } else if (ret) { return 1; } if (!is_stdin_a_terminal()) { - if (!opt.prompt_non_interactive.empty()) { - opt.prompt_non_interactive += "\n\n"; + if (!opt.user_.empty()) { + opt.user_ += "\n\n"; } - opt.prompt_non_interactive += read_pipe_data(); + opt.user_ += read_pipe_data(); } llama_log_set(log_callback, nullptr); @@ -401,7 +727,7 @@ int main(int argc, const char ** argv) { return 1; } - if (chat_loop(llama_data, opt.prompt_non_interactive)) { + if (chat_loop(llama_data, opt.user_)) { return 1; } diff --git a/examples/server/README.md b/examples/server/README.md index 738153981..8e74808be 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-dkvc, --dump-kv-cache` | verbose print of the KV cache | | `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | -| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | -| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | @@ -138,6 +138,7 @@ The project is under active development, and we are [looking for feedback and co | -------- | ----------- | | `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `-sp, --special` | special tokens output enabled (default: false) | +| `--no-warmup` | skip warming up the model with an empty run | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | @@ -146,6 +147,7 @@ The project is under active development, and we are [looking for feedback and co | `--host HOST` | ip address to listen (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | +| `--no-webui` | Disable the Web UI (default: enabled)
(env: LLAMA_ARG_NO_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | @@ -163,13 +165,13 @@ The project is under active development, and we are [looking for feedback and co | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
list of built-in templates:
chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | -| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) | -| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) | -| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) | -| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) | +| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | +| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)
(env: LLAMA_ARG_DRAFT_MIN) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)
(env: LLAMA_ARG_DRAFT_P_MIN) | +| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | | `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | -| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model | -| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | +| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | +| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. @@ -302,23 +304,23 @@ mkdir llama-client cd llama-client ``` -Create a index.js file and put this inside: +Create an index.js file and put this inside: ```javascript -const prompt = `Building a website can be done in 10 simple steps:`; +const prompt = "Building a website can be done in 10 simple steps:" -async function Test() { +async function test() { let response = await fetch("http://127.0.0.1:8080/completion", { - method: 'POST', + method: "POST", body: JSON.stringify({ prompt, - n_predict: 512, + n_predict: 64, }) }) console.log((await response.json()).content) } -Test() +test() ``` And run it: @@ -380,7 +382,7 @@ Multiple prompts are also supported. In this case, the completion result will be `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. -`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. +`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`. `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]` @@ -441,11 +443,11 @@ These words will not be included in the completion, so make sure to add them to `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. - `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` +`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` **Response format** -- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. +- Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support. - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 250729a44..9a19c5e83 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -11,84 +11,84 @@ 🦙 llama.cpp - chat - - + @@ -99,7 +99,7 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
-
+

Conversations

@@ -204,51 +204,25 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au( {{ messages.length === 0 ? 'Send a message to start' : '' }}
-
-
- - - - -
-
- - -
- - - - - -
+
-
-
- - -
+
+
@@ -311,6 +285,10 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
Advanced config
+
+ + Show tokens per second +
+ + + + +