diff --git a/.devops/tools.sh b/.devops/tools.sh index 24dcfd350..9a86e6ea0 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -8,11 +8,11 @@ arg1="$1" shift if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then - python3 ./convert_hf_to_gguf.py "$@" + exec python3 ./convert_hf_to_gguf.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then - ./llama-quantize "$@" + exec ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then - ./llama-cli "$@" + exec ./llama-cli "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Skip model quantization, it already exists: ${i/f16/q4_0}" else echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." - ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 + exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 fi done elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then - ./llama-server "$@" + exec ./llama-server "$@" else echo "Unknown command: $arg1" echo "Available commands: " diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 886d33d2d..022d31fb2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -662,6 +662,8 @@ jobs: defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + - build: 'llvm-arm64-opencl-adreno' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' steps: - name: Clone @@ -703,6 +705,28 @@ jobs: run: | choco install ninja + - name: Install OpenCL Headers and Libs + id: install_opencl + if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} + run: | + git clone https://github.com/KhronosGroup/OpenCL-Headers + cd OpenCL-Headers + mkdir build && cd build + cmake .. ` + -DBUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build . --target install + git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader + cd OpenCL-ICD-Loader + mkdir build-arm64-release && cd build-arm64-release + cmake .. ` + -A arm64 ` + -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build . 
--target install --config release + - name: Build id: cmake_build run: | @@ -732,7 +756,7 @@ jobs: - name: Test id: cmake_test # not all machines have native AVX-512 - if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} + if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} run: | cd build ctest -L main -C Release --verbose --timeout 900 diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 9e66fb68c..671fe595c 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -79,7 +79,7 @@ jobs: # Setup nodejs (to be used for verifying bundled index.html) - uses: actions/setup-node@v4 with: - node-version: 22 + node-version: '22.11.0' - name: Verify bundled index.html id: verify_server_index_html diff --git a/CODEOWNERS b/CODEOWNERS index 88ab6de4f..adeba5395 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,3 +1,5 @@ # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs -ci/ @ggerganov +/ci/ @ggerganov +/.devops/ @ngxson +/examples/server/ @ngxson diff --git a/README.md b/README.md index 6fdd8d9ee..54466c250 100644 --- a/README.md +++ b/README.md @@ -433,6 +433,20 @@ To learn more about model quantization, [read this documentation](examples/quant +## [`llama-run`](examples/run) + +#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3]. + +-
+ Run a model with a specific prompt (by default it's pulled from Ollama registry) + + ```bash + llama-run granite-code + ``` + +
+ +[^3]: [https://github.com/containers/ramalama](RamaLama) ## [`llama-simple`](examples/simple) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 89862fe11..df1cdf9a5 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info) # Use curl to download model url if (LLAMA_CURL) find_package(CURL REQUIRED) - add_definitions(-DLLAMA_USE_CURL) + target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) find_library(CURL_LIBRARY curl REQUIRED) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) diff --git a/common/arg.cpp b/common/arg.cpp index 49af31682..39bc874c8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) { } } +const std::vector kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; + +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); +} + +static std::string get_all_kv_cache_types() { + std::ostringstream msg; + for (const auto & type : kv_cache_types) { + msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", "); + } + return msg.str(); +} + // // CLI argument parsing functions // @@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", - string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + string_format( + "KV cache data type for K\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.cache_type_k) + ), [](common_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_k = value; + params.cache_type_k = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_K")); add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", - string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + string_format( + "KV cache data type for V\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.cache_type_v) + ), [](common_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_v = value; + params.cache_type_v = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); add_opt(common_arg( @@ -2083,35 +2122,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.speculative.n_max = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); add_opt(common_arg( {"--draft-min", "--draft-n-min"}, "N", string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), [](common_params & params, int value) { params.speculative.n_min = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + 
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); add_opt(common_arg( {"--draft-p-split"}, "P", string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), [](common_params & params, const std::string & value) { params.speculative.p_split = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); add_opt(common_arg( {"--draft-p-min"}, "P", string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), [](common_params & params, const std::string & value) { params.speculative.p_min = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), [](common_params & params, int value) { params.speculative.n_ctx = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); add_opt(common_arg( {"-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" @@ -2131,14 +2170,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { params.speculative.model = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); return ctx_arg; } diff --git a/common/common.cpp b/common/common.cpp index 6143516d2..3adfb0329 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1015,38 +1015,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { return mparams; } -static ggml_type kv_cache_type_from_str(const std::string & s) { - if (s == "f32") { - return GGML_TYPE_F32; - } - if (s == "f16") { - return GGML_TYPE_F16; - } - if (s == "bf16") { - return GGML_TYPE_BF16; - } - if (s == "q8_0") { - return GGML_TYPE_Q8_0; - } - if (s == "q4_0") { - return GGML_TYPE_Q4_0; - } - if (s == "q4_1") { - return GGML_TYPE_Q4_1; - } - if (s == "iq4_nl") { - return GGML_TYPE_IQ4_NL; - } - if (s == "q5_0") { - return GGML_TYPE_Q5_0; - } - if (s == "q5_1") { - return GGML_TYPE_Q5_1; - } - - throw std::runtime_error("Unsupported cache type: " + s); -} - struct llama_context_params common_context_params_to_llama(const common_params & params) { auto cparams = llama_context_default_params(); @@ -1081,8 +1049,8 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; } - cparams.type_k = kv_cache_type_from_str(params.cache_type_k); - cparams.type_v = 
kv_cache_type_from_str(params.cache_type_v); + cparams.type_k = params.cache_type_k; + cparams.type_v = params.cache_type_v; return cparams; } @@ -1108,12 +1076,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p #define CURL_MAX_RETRY 3 #define CURL_RETRY_DELAY_SECONDS 2 - -static bool starts_with(const std::string & str, const std::string & prefix) { - // While we wait for C++20's std::string::starts_with... - return str.rfind(prefix, 0) == 0; -} - static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { int remaining_attempts = max_attempts; diff --git a/common/common.h b/common/common.h index 95d20401d..9e47b70a4 100644 --- a/common/common.h +++ b/common/common.h @@ -37,9 +37,9 @@ using llama_tokens = std::vector; // build info extern int LLAMA_BUILD_NUMBER; -extern char const * LLAMA_COMMIT; -extern char const * LLAMA_COMPILER; -extern char const * LLAMA_BUILD_TARGET; +extern const char * LLAMA_COMMIT; +extern const char * LLAMA_COMPILER; +extern const char * LLAMA_BUILD_TARGET; struct common_control_vector_load_info; @@ -286,8 +286,8 @@ struct common_params { bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data - std::string cache_type_k = "f16"; // KV cache data type for the K - std::string cache_type_v = "f16"; // KV cache data type for the V + ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K + ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector // NOLINT @@ -437,6 +437,11 @@ std::vector string_split(const std::string & input, ch return parts; } +static bool string_starts_with(const std::string & str, + const std::string & prefix) { // While we wait for C++20's std::string::starts_with... 
+ return str.rfind(prefix, 0) == 0; +} + bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9210e9fea..21b31392e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,7 +20,12 @@ else() add_subdirectory(batched) add_subdirectory(embedding) add_subdirectory(eval-callback) - add_subdirectory(gbnf-validator) + + if (NOT WIN32) + # disabled on Windows because it uses internal functions not exported with LLAMA_API + add_subdirectory(gbnf-validator) + endif() + add_subdirectory(gguf-hash) add_subdirectory(gguf-split) add_subdirectory(gguf) @@ -46,12 +51,16 @@ else() add_subdirectory(speculative) add_subdirectory(speculative-simple) add_subdirectory(tokenize) + add_subdirectory(gen-docs) if (NOT GGML_BACKEND_DL) # these examples use the backends directly and cannot be built with dynamic loading add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(cvector-generator) add_subdirectory(export-lora) - add_subdirectory(quantize-stats) + if (NOT WIN32) + # disabled on Windows because it uses internal functions not exported with LLAMA_API + add_subdirectory(quantize-stats) + endif() add_subdirectory(llava) if (GGML_RPC) add_subdirectory(rpc) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 7e62657e1..75f63f938 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -287,7 +287,7 @@ struct split_strategy { } void print_info() { - printf("n_split: %ld\n", ctx_outs.size()); + printf("n_split: %zu\n", ctx_outs.size()); int i_split = 0; for (auto & ctx_out : ctx_outs) { // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) @@ -297,7 +297,7 @@ struct split_strategy { total_size += ggml_nbytes(t); } total_size = total_size / 1000 / 1000; // convert to megabytes - printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); i_split++; } } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index bac606f47..2338ad106 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) { for (const auto & inst : params_instances) { params_idx++; if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); + fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count); } // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { @@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) { // warmup run if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count); } //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, 
params_count); } test_gen(ctx, 1, t.n_threads); } @@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) { if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, + fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, + fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_gen(ctx, t.n_gen, t.n_threads); diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67..f9cce7b21 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index e78a8596d..23ff4db27 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -143,7 +143,7 @@ int main(int argc, char ** argv) { std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); } - LOG_INF("Number of chunks: %ld\n", chunks.size()); + LOG_INF("Number of chunks: %zu\n", chunks.size()); llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 52add51ef..0686d6305 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-run) add_executable(${TARGET} run.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/run/README.md b/examples/run/README.md index 6e926811f..6162658e9 100644 --- a/examples/run/README.md +++ b/examples/run/README.md @@ -3,5 +3,45 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models. ```bash -./llama-run Meta-Llama-3.1-8B-Instruct.gguf +llama-run granite-code +... + +```bash +llama-run -h +Description: + Runs a llm + +Usage: + llama-run [options] model [prompt] + +Options: + -c, --context-size + Context size (default: 2048) + -n, --ngl + Number of GPU layers (default: 0) + -h, --help + Show help message + +Commands: + model + Model is a string with an optional prefix of + huggingface:// (hf://), ollama://, https:// or file://. 
+ If no protocol is specified and a file exists in the specified + path, file:// is assumed, otherwise if a file does not exist in + the specified path, ollama:// is assumed. Models that are being + pulled are downloaded with .partial extension while being + downloaded and then renamed as the file without the .partial + extension when complete. + +Examples: + llama-run llama3 + llama-run ollama://granite-code + llama-run ollama://smollm:135m + llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf + llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf + llama-run https://example.com/some-file1.gguf + llama-run some-file2.gguf + llama-run file://some-file3.gguf + llama-run --ngl 99 some-file4.gguf + llama-run --ngl 99 some-file5.gguf Hello World ... diff --git a/examples/run/run.cpp b/examples/run/run.cpp index cac2faefc..834ea8f7b 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1,128 +1,350 @@ #if defined(_WIN32) -#include +# include #else -#include +# include #endif -#include +#if defined(LLAMA_USE_CURL) +# include +#endif + +#include #include #include +#include #include #include #include -#include #include +#include "common.h" +#include "json.hpp" #include "llama-cpp.h" -typedef std::unique_ptr char_array_ptr; +#define printe(...) \ + do { \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) -struct Argument { - std::string flag; - std::string help_text; -}; +class Opt { + public: + int init(int argc, const char ** argv) { + construct_help_str_(); + // Parse arguments + if (parse(argc, argv)) { + printe("Error: Failed to parse arguments.\n"); + help(); + return 1; + } -struct Options { - std::string model_path, prompt_non_interactive; - int ngl = 99; - int n_ctx = 2048; -}; + // If help is requested, show help and exit + if (help_) { + help(); + return 2; + } -class ArgumentParser { - public: - ArgumentParser(const char * program_name) : program_name(program_name) {} - - void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") { - string_args[flag] = &var; - arguments.push_back({flag, help_text}); + return 0; // Success } - void add_argument(const std::string & flag, int & var, const std::string & help_text = "") { - int_args[flag] = &var; - arguments.push_back({flag, help_text}); + std::string model_; + std::string user_; + int context_size_ = 2048, ngl_ = -1; + + private: + std::string help_str_; + bool help_ = false; + + void construct_help_str_() { + help_str_ = + "Description:\n" + " Runs a llm\n" + "\n" + "Usage:\n" + " llama-run [options] model [prompt]\n" + "\n" + "Options:\n" + " -c, --context-size \n" + " Context size (default: " + + std::to_string(context_size_); + help_str_ += + ")\n" + " -n, --ngl \n" + " Number of GPU layers (default: " + + std::to_string(ngl_); + help_str_ += + ")\n" + " -h, --help\n" + " Show help message\n" + "\n" + "Commands:\n" + " model\n" + " Model is a string with an optional prefix of \n" + " huggingface:// (hf://), ollama://, https:// or file://.\n" + " If no protocol is specified and a file exists in the specified\n" + " path, file:// is assumed, otherwise if a file does not exist in\n" + " the specified path, ollama:// is assumed. 
Models that are being\n" + " pulled are downloaded with .partial extension while being\n" + " downloaded and then renamed as the file without the .partial\n" + " extension when complete.\n" + "\n" + "Examples:\n" + " llama-run llama3\n" + " llama-run ollama://granite-code\n" + " llama-run ollama://smollm:135m\n" + " llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n" + " llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n" + " llama-run https://example.com/some-file1.gguf\n" + " llama-run some-file2.gguf\n" + " llama-run file://some-file3.gguf\n" + " llama-run --ngl 99 some-file4.gguf\n" + " llama-run --ngl 99 some-file5.gguf Hello World\n"; } int parse(int argc, const char ** argv) { + int positional_args_i = 0; for (int i = 1; i < argc; ++i) { - std::string arg = argv[i]; - if (string_args.count(arg)) { - if (i + 1 < argc) { - *string_args[arg] = argv[++i]; - } else { - fprintf(stderr, "error: missing value for %s\n", arg.c_str()); - print_usage(); + if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) { + if (i + 1 >= argc) { return 1; } - } else if (int_args.count(arg)) { - if (i + 1 < argc) { - if (parse_int_arg(argv[++i], *int_args[arg]) != 0) { - fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]); - print_usage(); - return 1; - } - } else { - fprintf(stderr, "error: missing value for %s\n", arg.c_str()); - print_usage(); + + context_size_ = std::atoi(argv[++i]); + } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) { + if (i + 1 >= argc) { return 1; } + + ngl_ = std::atoi(argv[++i]); + } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { + help_ = true; + return 0; + } else if (!positional_args_i) { + ++positional_args_i; + model_ = argv[i]; + } else if (positional_args_i == 1) { + ++positional_args_i; + user_ = argv[i]; } else { - fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str()); - print_usage(); - return 1; + user_ += " " + std::string(argv[i]); } } - if (string_args["-m"]->empty()) { - fprintf(stderr, "error: -m is required\n"); - print_usage(); + return model_.empty(); // model_ is the only required value + } + + void help() const { printf("%s", help_str_.c_str()); } +}; + +struct progress_data { + size_t file_size = 0; + std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); + bool printed = false; +}; + +struct FileDeleter { + void operator()(FILE * file) const { + if (file) { + fclose(file); + } + } +}; + +typedef std::unique_ptr FILE_ptr; + +#ifdef LLAMA_USE_CURL +class CurlWrapper { + public: + int init(const std::string & url, const std::vector & headers, const std::string & output_file, + const bool progress, std::string * response_str = nullptr) { + std::string output_file_partial; + curl = curl_easy_init(); + if (!curl) { return 1; } + progress_data data; + FILE_ptr out; + if (!output_file.empty()) { + output_file_partial = output_file + ".partial"; + out.reset(fopen(output_file_partial.c_str(), "ab")); + } + + set_write_options(response_str, out); + data.file_size = set_resume_point(output_file_partial); + set_progress_options(progress, data); + set_headers(headers); + perform(url); + if (!output_file.empty()) { + std::filesystem::rename(output_file_partial, output_file); + } + return 0; } - private: - const char * program_name; - std::unordered_map string_args; - std::unordered_map int_args; - std::vector arguments; + ~CurlWrapper() { + if (chunk) { + 
curl_slist_free_all(chunk); + } - int parse_int_arg(const char * arg, int & value) { - char * end; - const long val = std::strtol(arg, &end, 10); - if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) { - value = static_cast(val); + if (curl) { + curl_easy_cleanup(curl); + } + } + + private: + CURL * curl = nullptr; + struct curl_slist * chunk = nullptr; + + void set_write_options(std::string * response_str, const FILE_ptr & out) { + if (response_str) { + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str); + } else { + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.get()); + } + } + + size_t set_resume_point(const std::string & output_file) { + size_t file_size = 0; + if (std::filesystem::exists(output_file)) { + file_size = std::filesystem::file_size(output_file); + curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast(file_size)); + } + + return file_size; + } + + void set_progress_options(bool progress, progress_data & data) { + if (progress) { + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data); + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback); + } + } + + void set_headers(const std::vector & headers) { + if (!headers.empty()) { + if (chunk) { + curl_slist_free_all(chunk); + chunk = 0; + } + + for (const auto & header : headers) { + chunk = curl_slist_append(chunk, header.c_str()); + } + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk); + } + } + + void perform(const std::string & url) { + CURLcode res; + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https"); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L); + res = curl_easy_perform(curl); + if (res != CURLE_OK) { + printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res)); + } + } + + static std::string human_readable_time(double seconds) { + int hrs = static_cast(seconds) / 3600; + int mins = (static_cast(seconds) % 3600) / 60; + int secs = static_cast(seconds) % 60; + + std::ostringstream out; + if (hrs > 0) { + out << hrs << "h " << std::setw(2) << std::setfill('0') << mins << "m " << std::setw(2) << std::setfill('0') + << secs << "s"; + } else if (mins > 0) { + out << mins << "m " << std::setw(2) << std::setfill('0') << secs << "s"; + } else { + out << secs << "s"; + } + + return out.str(); + } + + static std::string human_readable_size(curl_off_t size) { + static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" }; + char length = sizeof(suffix) / sizeof(suffix[0]); + int i = 0; + double dbl_size = size; + if (size > 1024) { + for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) { + dbl_size = size / 1024.0; + } + } + + std::ostringstream out; + out << std::fixed << std::setprecision(2) << dbl_size << " " << suffix[i]; + return out.str(); + } + + static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t, + curl_off_t) { + progress_data * data = static_cast(ptr); + if (total_to_download <= 0) { return 0; } - return 1; - } - void print_usage() const { - printf("\nUsage:\n"); - printf(" %s [OPTIONS]\n\n", program_name); - printf("Options:\n"); - for (const auto & arg : arguments) { - printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str()); + total_to_download += data->file_size; + const curl_off_t now_downloaded_plus_file_size = 
now_downloaded + data->file_size; + const curl_off_t percentage = (now_downloaded_plus_file_size * 100) / total_to_download; + const curl_off_t pos = (percentage / 5); + std::string progress_bar; + for (int i = 0; i < 20; ++i) { + progress_bar.append((i < pos) ? "█" : " "); } - printf("\n"); + // Calculate download speed and estimated time to completion + const auto now = std::chrono::steady_clock::now(); + const std::chrono::duration elapsed_seconds = now - data->start_time; + const double speed = now_downloaded / elapsed_seconds.count(); + const double estimated_time = (total_to_download - now_downloaded) / speed; + printe("\r%ld%% |%s| %s/%s %.2f MB/s %s ", percentage, progress_bar.c_str(), + human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(), + speed / (1024 * 1024), human_readable_time(estimated_time).c_str()); + fflush(stderr); + data->printed = true; + + return 0; + } + + // Function to write data to a file + static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) { + FILE * out = static_cast(stream); + return fwrite(ptr, size, nmemb, out); + } + + // Function to capture data into a string + static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) { + std::string * str = static_cast(stream); + str->append(static_cast(ptr), size * nmemb); + return size * nmemb; } }; +#endif class LlamaData { - public: - llama_model_ptr model; - llama_sampler_ptr sampler; - llama_context_ptr context; + public: + llama_model_ptr model; + llama_sampler_ptr sampler; + llama_context_ptr context; std::vector messages; + std::vector msg_strs; + std::vector fmtted; - int init(const Options & opt) { - model = initialize_model(opt.model_path, opt.ngl); + int init(Opt & opt) { + model = initialize_model(opt); if (!model) { return 1; } - context = initialize_context(model, opt.n_ctx); + context = initialize_context(model, opt.context_size_); if (!context) { return 1; } @@ -131,15 +353,123 @@ class LlamaData { return 0; } - private: - // Initializes the model and returns a unique pointer to it - llama_model_ptr initialize_model(const std::string & model_path, const int ngl) { - llama_model_params model_params = llama_model_default_params(); - model_params.n_gpu_layers = ngl; + private: +#ifdef LLAMA_USE_CURL + int download(const std::string & url, const std::vector & headers, const std::string & output_file, + const bool progress, std::string * response_str = nullptr) { + CurlWrapper curl; + if (curl.init(url, headers, output_file, progress, response_str)) { + return 1; + } - llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params)); + return 0; + } +#else + int download(const std::string &, const std::vector &, const std::string &, const bool, + std::string * = nullptr) { + printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + return 1; + } +#endif + + int huggingface_dl(const std::string & model, const std::vector headers, const std::string & bn) { + // Find the second occurrence of '/' after protocol string + size_t pos = model.find('/'); + pos = model.find('/', pos + 1); + if (pos == std::string::npos) { + return 1; + } + + const std::string hfr = model.substr(0, pos); + const std::string hff = model.substr(pos + 1); + const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff; + return download(url, headers, bn, true); + } + + int ollama_dl(std::string & model, const std::vector headers, const std::string & bn) { + if 
(model.find('/') == std::string::npos) { + model = "library/" + model; + } + + std::string model_tag = "latest"; + size_t colon_pos = model.find(':'); + if (colon_pos != std::string::npos) { + model_tag = model.substr(colon_pos + 1); + model = model.substr(0, colon_pos); + } + + std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag; + std::string manifest_str; + const int ret = download(manifest_url, headers, "", false, &manifest_str); + if (ret) { + return ret; + } + + nlohmann::json manifest = nlohmann::json::parse(manifest_str); + std::string layer; + for (const auto & l : manifest["layers"]) { + if (l["mediaType"] == "application/vnd.ollama.image.model") { + layer = l["digest"]; + break; + } + } + + std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer; + return download(blob_url, headers, bn, true); + } + + std::string basename(const std::string & path) { + const size_t pos = path.find_last_of("/\\"); + if (pos == std::string::npos) { + return path; + } + + return path.substr(pos + 1); + } + + int remove_proto(std::string & model_) { + const std::string::size_type pos = model_.find("://"); + if (pos == std::string::npos) { + return 1; + } + + model_ = model_.substr(pos + 3); // Skip past "://" + return 0; + } + + int resolve_model(std::string & model_) { + const std::string bn = basename(model_); + const std::vector headers = { "--header", + "Accept: application/vnd.docker.distribution.manifest.v2+json" }; + int ret = 0; + if (string_starts_with(model_, "file://") || std::filesystem::exists(bn)) { + remove_proto(model_); + } else if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) { + remove_proto(model_); + ret = huggingface_dl(model_, headers, bn); + } else if (string_starts_with(model_, "ollama://")) { + remove_proto(model_); + ret = ollama_dl(model_, headers, bn); + } else if (string_starts_with(model_, "https://")) { + download(model_, headers, bn, true); + } else { + ret = ollama_dl(model_, headers, bn); + } + + model_ = bn; + + return ret; + } + + // Initializes the model and returns a unique pointer to it + llama_model_ptr initialize_model(Opt & opt) { + ggml_backend_load_all(); + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = opt.ngl_ >= 0 ? 
opt.ngl_ : model_params.n_gpu_layers; + resolve_model(opt.model_); + llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params)); if (!model) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str()); } return model; @@ -148,12 +478,11 @@ class LlamaData { // Initializes the context with the specified parameters llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) { llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = n_ctx; - ctx_params.n_batch = n_ctx; - + ctx_params.n_ctx = n_ctx; + ctx_params.n_batch = n_ctx; llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params)); if (!context) { - fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__); + printe("%s: error: failed to create the llama_context\n", __func__); } return context; @@ -170,23 +499,22 @@ class LlamaData { } }; -// Add a message to `messages` and store its content in `owned_content` -static void add_message(const char * role, const std::string & text, LlamaData & llama_data, - std::vector & owned_content) { - char_array_ptr content(new char[text.size() + 1]); - std::strcpy(content.get(), text.c_str()); - llama_data.messages.push_back({role, content.get()}); - owned_content.push_back(std::move(content)); +// Add a message to `messages` and store its content in `msg_strs` +static void add_message(const char * role, const std::string & text, LlamaData & llama_data) { + llama_data.msg_strs.push_back(std::move(text)); + llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() }); } // Function to apply the chat template and resize `formatted` if needed -static int apply_chat_template(const LlamaData & llama_data, std::vector & formatted, const bool append) { - int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), - llama_data.messages.size(), append, formatted.data(), formatted.size()); - if (result > static_cast(formatted.size())) { - formatted.resize(result); +static int apply_chat_template(LlamaData & llama_data, const bool append) { + int result = llama_chat_apply_template( + llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append, + append ? llama_data.fmtted.data() : nullptr, append ? 
llama_data.fmtted.size() : 0); + if (append && result > static_cast(llama_data.fmtted.size())) { + llama_data.fmtted.resize(result); result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), - llama_data.messages.size(), append, formatted.data(), formatted.size()); + llama_data.messages.size(), append, llama_data.fmtted.data(), + llama_data.fmtted.size()); } return result; @@ -199,7 +527,8 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr prompt_tokens.resize(n_prompt_tokens); if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { - GGML_ABORT("failed to tokenize the prompt\n"); + printe("failed to tokenize the prompt\n"); + return -1; } return n_prompt_tokens; @@ -207,11 +536,11 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { - const int n_ctx = llama_n_ctx(ctx.get()); + const int n_ctx = llama_n_ctx(ctx.get()); const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); - fprintf(stderr, "context size exceeded\n"); + printe("context size exceeded\n"); return 1; } @@ -221,9 +550,10 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch & // convert the token to a string static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) { char buf[256]; - int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); + int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); if (n < 0) { - GGML_ABORT("failed to convert token to piece\n"); + printe("failed to convert token to piece\n"); + return 1; } piece = std::string(buf, n); @@ -238,19 +568,19 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st // helper function to evaluate a prompt and generate a response static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { - std::vector prompt_tokens; - const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens); - if (n_prompt_tokens < 0) { + std::vector tokens; + if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) { return 1; } // prepare a batch for the prompt - llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); llama_token new_token_id; while (true) { check_context_size(llama_data.context, batch); if (llama_decode(llama_data.context.get(), batch)) { - GGML_ABORT("failed to decode\n"); + printe("failed to decode\n"); + return 1; } // sample the next token, check is it an end of generation? 
@@ -273,22 +603,9 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str return 0; } -static int parse_arguments(const int argc, const char ** argv, Options & opt) { - ArgumentParser parser(argv[0]); - parser.add_argument("-m", opt.model_path, "model"); - parser.add_argument("-p", opt.prompt_non_interactive, "prompt"); - parser.add_argument("-c", opt.n_ctx, "context_size"); - parser.add_argument("-ngl", opt.ngl, "n_gpu_layers"); - if (parser.parse(argc, argv)) { - return 1; - } - - return 0; -} - static int read_user_input(std::string & user) { std::getline(std::cin, user); - return user.empty(); // Indicate an error or empty input + return user.empty(); // Should have data in happy path } // Function to generate a response based on the prompt @@ -296,7 +613,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt, // Set response color printf("\033[33m"); if (generate(llama_data, prompt, response)) { - fprintf(stderr, "failed to generate response\n"); + printe("failed to generate response\n"); return 1; } @@ -306,11 +623,10 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt, } // Helper function to apply the chat template and handle errors -static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector & formatted, - const bool is_user_input, int & output_length) { - const int new_len = apply_chat_template(llama_data, formatted, is_user_input); +static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) { + const int new_len = apply_chat_template(llama_data, append); if (new_len < 0) { - fprintf(stderr, "failed to apply the chat template\n"); + printe("failed to apply the chat template\n"); return -1; } @@ -319,56 +635,63 @@ static int apply_chat_template_with_error_handling(const LlamaData & llama_data, } // Helper function to handle user input -static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) { - if (!prompt_non_interactive.empty()) { - user_input = prompt_non_interactive; - return true; // No need for interactive input +static int handle_user_input(std::string & user_input, const std::string & user_) { + if (!user_.empty()) { + user_input = user_; + return 0; // No need for interactive input } - printf("\033[32m> \033[0m"); - return !read_user_input(user_input); // Returns false if input ends the loop + printf( + "\r " + "\r\033[32m> \033[0m"); + return read_user_input(user_input); // Returns true if input ends the loop } // Function to tokenize the prompt -static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) { - std::vector owned_content; - std::vector fmtted(llama_n_ctx(llama_data.context.get())); +static int chat_loop(LlamaData & llama_data, const std::string & user_) { int prev_len = 0; - + llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); while (true) { // Get user input std::string user_input; - if (!handle_user_input(user_input, prompt_non_interactive)) { - break; + while (handle_user_input(user_input, user_)) { } - add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data, - owned_content); - + add_message("user", user_.empty() ? 
user_input : user_, llama_data); int new_len; - if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) { + if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) { return 1; } - std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len); + std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); std::string response; if (generate_response(llama_data, prompt, response)) { return 1; } + + if (!user_.empty()) { + break; + } + + add_message("assistant", response, llama_data); + if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) { + return 1; + } } + return 0; } static void log_callback(const enum ggml_log_level level, const char * text, void *) { if (level == GGML_LOG_LEVEL_ERROR) { - fprintf(stderr, "%s", text); + printe("%s", text); } } static bool is_stdin_a_terminal() { #if defined(_WIN32) HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); - DWORD mode; + DWORD mode; return GetConsoleMode(hStdin, &mode); #else return isatty(STDIN_FILENO); @@ -382,17 +705,20 @@ static std::string read_pipe_data() { } int main(int argc, const char ** argv) { - Options opt; - if (parse_arguments(argc, argv, opt)) { + Opt opt; + const int ret = opt.init(argc, argv); + if (ret == 2) { + return 0; + } else if (ret) { return 1; } if (!is_stdin_a_terminal()) { - if (!opt.prompt_non_interactive.empty()) { - opt.prompt_non_interactive += "\n\n"; + if (!opt.user_.empty()) { + opt.user_ += "\n\n"; } - opt.prompt_non_interactive += read_pipe_data(); + opt.user_ += read_pipe_data(); } llama_log_set(log_callback, nullptr); @@ -401,7 +727,7 @@ int main(int argc, const char ** argv) { return 1; } - if (chat_loop(llama_data, opt.prompt_non_interactive)) { + if (chat_loop(llama_data, opt.user_)) { return 1; } diff --git a/examples/server/README.md b/examples/server/README.md index 6294f541f..a9443e56b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-dkvc, --dump-kv-cache` | verbose print of the KV cache | | `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | -| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | -| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | @@ -138,6 +138,7 @@ The project is under active development, and we are [looking for feedback and co | -------- | ----------- | | `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `-sp, --special` | special tokens output enabled (default: false) | +| `--no-warmup` | skip warming up the model with an empty run | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | @@ -146,7 +147,7 @@ The project is under active development, and we are [looking for feedback and co | `--host HOST` | ip address to listen (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | -| `--no-webui` | disable the Web UI
(env: LLAMA_ARG_NO_WEBUI) | +| `--no-webui` | Disable the Web UI (default: enabled)
(env: LLAMA_ARG_NO_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | @@ -164,13 +165,13 @@ The project is under active development, and we are [looking for feedback and co | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
list of built-in templates:
chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | -| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) | -| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) | -| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) | -| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) | +| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | +| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)
(env: LLAMA_ARG_DRAFT_MIN) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)
(env: LLAMA_ARG_DRAFT_P_MIN) | +| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | | `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | -| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model | -| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | +| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | +| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. @@ -303,23 +304,23 @@ mkdir llama-client cd llama-client ``` -Create a index.js file and put this inside: +Create an index.js file and put this inside: ```javascript -const prompt = `Building a website can be done in 10 simple steps:`; +const prompt = "Building a website can be done in 10 simple steps:" -async function Test() { +async function test() { let response = await fetch("http://127.0.0.1:8080/completion", { - method: 'POST', + method: "POST", body: JSON.stringify({ prompt, - n_predict: 512, + n_predict: 64, }) }) console.log((await response.json()).content) } -Test() +test() ``` And run it: @@ -381,7 +382,7 @@ Multiple prompts are also supported. In this case, the completion result will be `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. -`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. +`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`. `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]` @@ -442,11 +443,11 @@ These words will not be included in the completion, so make sure to add them to `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. - `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` +`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` **Response format** -- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. +- Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support. - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 250729a44..9a19c5e83 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -11,84 +11,84 @@ 🦙 llama.cpp - chat - - + @@ -99,7 +99,7 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
-
+

Conversations

@@ -204,51 +204,25 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au( {{ messages.length === 0 ? 'Send a message to start' : '' }}
-
-
- - - - -
-
- - -
- - - - - -
+
-
-
- - -
+
+
@@ -311,6 +285,10 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
Advanced config
+
+ + Show tokens per second +
+ + + + +
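
For reference, a brief usage sketch of the behaviour introduced by this patch: the `-ctk`/`-ctv` flags now validate their argument against the listed ggml cache types, the speculative-decoding options gained `LLAMA_ARG_*` environment variables, and `llama-run` resolves bare model names through the Ollama registry. The model file names below (`model.gguf`, `draft.gguf`) are placeholders, not files shipped with the project.

```bash
# quantized KV cache; the value must be one of: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
llama-server -m model.gguf -fa -ctk q8_0 -ctv q8_0

# draft-model settings can now also come from the environment instead of the command line
LLAMA_ARG_MODEL_DRAFT=draft.gguf LLAMA_ARG_DRAFT_MAX=16 LLAMA_ARG_DRAFT_MIN=5 \
    llama-server -m model.gguf

# llama-run pulls from the Ollama registry by default and understands hf://, https:// and file:// prefixes
llama-run granite-code
llama-run --ngl 99 some-file.gguf "Hello World"
```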