diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml index 550ee1b49..f10b3a2b2 100644 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml @@ -24,7 +24,8 @@ body: - type: dropdown id: operating-system attributes: - label: Which operating systems do you know to be affected? + label: Operating systems + description: Which operating systems do you know to be affected? multiple: true options: - Linux @@ -41,14 +42,17 @@ body: description: Which GGML backends do you know to be affected? options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] multiple: true + validations: + required: true - type: textarea - id: steps_to_reproduce + id: info attributes: - label: Steps to Reproduce + label: Problem description & steps to reproduce description: > - Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + Please give us a summary of the problem and tell us how to reproduce it. If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. placeholder: > + I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY. Here are the exact commands that I used: ... validations: required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml index 1adb162b7..1ccef0793 100644 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -26,7 +26,8 @@ body: - type: dropdown id: operating-system attributes: - label: Which operating systems do you know to be affected? + label: Operating systems + description: Which operating systems do you know to be affected? multiple: true options: - Linux @@ -43,6 +44,8 @@ body: description: Which GGML backends do you know to be affected? options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] multiple: true + validations: + required: true - type: textarea id: hardware attributes: @@ -55,20 +58,20 @@ body: - type: textarea id: model attributes: - label: Model + label: Models description: > - Which model at which quantization were you using when encountering the bug? + Which model(s) at which quantization were you using when encountering the bug? If you downloaded a GGUF file off of Huggingface, please provide a link. placeholder: > e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M validations: required: false - type: textarea - id: steps_to_reproduce + id: info attributes: - label: Steps to Reproduce + label: Problem description & steps to reproduce description: > - Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + Please give us a summary of the problem and tell us how to reproduce it. If you can narrow down the bug to specific hardware, compile flags, or command line arguments, that information would be very much appreciated by us. placeholder: > diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml index 124cdee91..d157ea307 100644 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -14,7 +14,7 @@ body: id: version attributes: label: Name and Version - description: Which version of our software are you running? (use `--version` to get a version string) + description: Which version of our software is affected? 
(You can use `--version` to get a version string.) placeholder: | $./llama-cli --version version: 2999 (42b4109e) @@ -24,7 +24,8 @@ body: - type: dropdown id: operating-system attributes: - label: Which operating systems do you know to be affected? + label: Operating systems + description: Which operating systems do you know to be affected? multiple: true options: - Linux @@ -33,28 +34,30 @@ body: - BSD - Other? (Please let us know in description) validations: - required: true + required: false - type: dropdown id: module attributes: label: Which llama.cpp modules do you know to be affected? multiple: true options: + - Documentation/Github - libllama (core library) - llama-cli - llama-server - llama-bench - llama-quantize - Python/Bash scripts + - Test code - Other (Please specify in the next section) validations: - required: true + required: false - type: textarea - id: steps_to_reproduce + id: info attributes: - label: Steps to Reproduce + label: Problem description & steps to reproduce description: > - Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + Please give us a summary of the problem and tell us how to reproduce it (if applicable). validations: required: true - type: textarea @@ -62,7 +65,7 @@ body: attributes: label: First Bad Commit description: > - If the bug was not present on an earlier version: when did it start appearing? + If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing? If possible, please do a git bisect and identify the exact commit that introduced the bug. validations: required: false @@ -71,8 +74,8 @@ body: attributes: label: Relevant log output description: > - Please copy and paste any relevant log output, including the command that you entered and any generated text. + If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text. This will be automatically formatted into code, so no need for backticks. 
render: shell validations: - required: true + required: false diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 572f91643..abaf2c504 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -952,7 +952,7 @@ jobs: env: WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" steps: - name: Clone @@ -962,7 +962,8 @@ jobs: fetch-depth: 0 - name: Install - run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + run: | + scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - name: Build id: cmake_build @@ -981,27 +982,34 @@ jobs: echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT fi - - name: Pack artifacts + - name: Build the release package id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} run: | echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin + echo "cp oneAPI running time dll files to ./build/bin done" 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + - name: Upload the release package + if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a953cdac9..9cef283d9 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -10,12 +10,10 @@ name: Publish Docker image on: - #pull_request: - push: - branches: - - master - paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] - workflow_dispatch: # allows manual 
triggering, useful for debugging + workflow_dispatch: # allows manual triggering + schedule: + # Rebuild daily rather than on every push because it is expensive + - cron: '12 4 * * *' concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -29,7 +27,6 @@ permissions: jobs: push_to_registry: name: Push Docker image to Docker Hub - #if: github.event.pull_request.draft == false runs-on: ubuntu-latest env: diff --git a/CMakeLists.txt b/CMakeLists.txt index 994e61e45..e7d91a5b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,8 +163,11 @@ if (GGML_TARGET_DEFINES) list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES}) endif() get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) - -set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) +# all public headers +set(LLAMA_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h) +set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}") install(TARGETS llama LIBRARY PUBLIC_HEADER) configure_package_config_file( diff --git a/Makefile b/Makefile index dd6d864ad..cfc74c1dc 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ BUILD_TARGETS = \ llama-server \ llama-simple \ llama-simple-chat \ + llama-run \ llama-speculative \ llama-tokenize \ llama-vdot \ @@ -251,7 +252,7 @@ endif # # keep standard at C11 and C++11 -MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon +MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC MK_NVCCFLAGS = -std=c++11 @@ -290,6 +291,7 @@ endif # some memory allocation are available on Linux through GNU extensions in libc ifeq ($(UNAME_S),Linux) MK_CPPFLAGS += -D_GNU_SOURCE + MK_LDFLAGS += -ldl endif # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, @@ -1166,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +llama-run: examples/run/run.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + llama-simple: examples/simple/simple.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/Package.swift b/Package.swift index 6b68aecde..d9e8a4e2d 100644 --- a/Package.swift +++ b/Package.swift @@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ .define("GGML_USE_ACCELERATE"), - .define("GGML_USE_METAL") + .define("GGML_USE_METAL"), + .define("GGML_USE_CPU") ] ) #endif diff --git a/common/arg.cpp b/common/arg.cpp index 32240f21f..272492e50 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -298,6 +298,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) { print_options(specific_options); } +static std::vector parse_device_list(const std::string & value) { + std::vector devices; + auto dev_names = string_split(value, ','); + if (dev_names.empty()) { + throw std::invalid_argument("no devices specified"); + } + if (dev_names.size() == 1 && dev_names[0] == "none") { + devices.push_back(nullptr); + } else { + for (const auto & device : dev_names) { + auto * dev = ggml_backend_dev_by_name(device.c_str()); + if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { + throw 
std::invalid_argument(string_format("invalid device: %s", device.c_str())); + } + devices.push_back(dev); + } + devices.push_back(nullptr); + } + return devices; +} + bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { auto ctx_arg = common_params_parser_init(params, ex, print_usage); const common_params params_org = ctx_arg.params; // the example can modify the default params @@ -324,6 +345,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e } common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + // load dynamic backends + ggml_backend_load_all(); + common_params_context ctx_arg(params); ctx_arg.print_usage = print_usage; ctx_arg.ex = ex; @@ -1312,6 +1336,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_NUMA")); + add_opt(common_arg( + {"-dev", "--device"}, "", + "comma-separated list of devices to use for offloading (none = don't offload)\n" + "use --list-devices to see a list of available devices", + [](common_params & params, const std::string & value) { + params.devices = parse_device_list(value); + } + ).set_env("LLAMA_ARG_DEVICE")); + add_opt(common_arg( + {"--list-devices"}, + "print list of available devices and exit", + [](common_params &) { + printf("Available devices:\n"); + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + } + } + exit(0); + } + )); add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", @@ -1336,10 +1384,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } else if (arg_next == "layer") { params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. 
It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { throw std::invalid_argument("invalid value"); @@ -2042,6 +2086,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.n_ctx = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-devd", "--device-draft"}, "", + "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" + "use --list-devices to see a list of available devices", + [](common_params & params, const std::string & value) { + params.speculative.devices = parse_device_list(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", diff --git a/common/common.cpp b/common/common.cpp index c398329d0..09ec9f238 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -979,9 +979,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector devices; // devices to use for offloading int32_t n_ctx = 0; // draft context size int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding @@ -178,9 +179,6 @@ struct common_params { int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width int32_t n_print = -1; // print token count every n tokens (-1 = disabled) @@ -193,6 +191,13 @@ struct common_params { int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = 0.1f; // KV cache defragmentation threshold + // offload params + std::vector devices; // devices to use for offloading + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs + struct cpu_params cpuparams; struct cpu_params cpuparams_batch; @@ -201,7 +206,6 @@ struct common_params { ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; - enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings @@ -462,7 +466,7 @@ struct common_init_result { struct common_init_result common_init_from_params(common_params & params); -struct llama_model_params common_model_params_to_llama (const common_params & params); +struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params 
common_context_params_to_llama(const common_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); diff --git a/common/speculative.cpp b/common/speculative.cpp index fe315a270..e559675c4 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -90,9 +90,10 @@ bool common_speculative_are_compatible( if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || llama_token_bos(model_tgt) != llama_token_bos(model_dft) || - llama_token_eos(model_tgt) != llama_token_eos(model_dft) - ) { + llama_token_eos(model_tgt) != llama_token_eos(model_dft)) { LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); + LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt)); + LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft)); return false; } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 80a179b86..b931049d1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3040,9 +3040,9 @@ class OlmoModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo1124ForCausalLM") -class Olmo1124Model(Model): - model_arch = gguf.MODEL_ARCH.OLMO_1124 +@Model.register("Olmo2ForCausalLM") +class Olmo2Model(Model): + model_arch = gguf.MODEL_ARCH.OLMO2 @Model.register("OlmoeForCausalLM") diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9bd099d4e..21db1f3c2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(cvector-generator) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) - add_subdirectory(export-lora) add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) @@ -27,29 +24,36 @@ else() add_subdirectory(imatrix) add_subdirectory(infill) add_subdirectory(llama-bench) - add_subdirectory(llava) add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(main) add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(perplexity) - add_subdirectory(quantize-stats) add_subdirectory(quantize) add_subdirectory(retrieval) - if (GGML_RPC) - add_subdirectory(rpc) - endif() if (LLAMA_BUILD_SERVER) - add_subdirectory(server) - endif() - if (GGML_SYCL) - add_subdirectory(sycl) + add_subdirectory(server) endif() add_subdirectory(save-load-state) + add_subdirectory(run) add_subdirectory(simple) add_subdirectory(simple-chat) add_subdirectory(speculative) add_subdirectory(speculative-simple) add_subdirectory(tokenize) + if (NOT GGML_BACKEND_DL) + # these examples use the backends directly and cannot be built with dynamic loading + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(cvector-generator) + add_subdirectory(export-lora) + add_subdirectory(quantize-stats) + add_subdirectory(llava) + if (GGML_RPC) + add_subdirectory(rpc) + endif() + if (GGML_SYCL) + add_subdirectory(sycl) + endif() + endif() endif() diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index a48753d38..5d1048aad 100644 
--- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} + COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 3dc84a75c..bac606f47 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) { cmd_params params = parse_cmd_params(argc, argv); + // initialize backends + ggml_backend_load_all(); + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); + return 1; + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free"); + // initialize llama.cpp if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); @@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_threadpool_free(threadpool); + ggml_threadpool_free_fn(threadpool); } llama_free_model(lmodel); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 957451af7..d0c28f317 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -165,6 +165,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -174,7 +178,7 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_threadpool_new(&tpp_batch); + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); if (!threadpool_batch) { LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); return 1; @@ -184,7 +188,7 @@ int main(int argc, char ** argv) { tpp.paused = true; } - struct ggml_threadpool * threadpool = 
ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); return 1; @@ -890,8 +894,8 @@ int main(int argc, char ** argv) { llama_backend_free(); - ggml_threadpool_free(threadpool); - ggml_threadpool_free(threadpool_batch); + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); return 0; } diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt new file mode 100644 index 000000000..084f1e92d --- /dev/null +++ b/examples/run/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-run) +add_executable(${TARGET} run.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/run/README.md b/examples/run/README.md new file mode 100644 index 000000000..6e926811f --- /dev/null +++ b/examples/run/README.md @@ -0,0 +1,7 @@ +# llama.cpp/example/run + +The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models. + +```bash +./llama-run Meta-Llama-3.1-8B-Instruct.gguf +... diff --git a/examples/run/run.cpp b/examples/run/run.cpp new file mode 100644 index 000000000..cac2faefc --- /dev/null +++ b/examples/run/run.cpp @@ -0,0 +1,409 @@ +#if defined(_WIN32) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llama-cpp.h" + +typedef std::unique_ptr char_array_ptr; + +struct Argument { + std::string flag; + std::string help_text; +}; + +struct Options { + std::string model_path, prompt_non_interactive; + int ngl = 99; + int n_ctx = 2048; +}; + +class ArgumentParser { + public: + ArgumentParser(const char * program_name) : program_name(program_name) {} + + void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") { + string_args[flag] = &var; + arguments.push_back({flag, help_text}); + } + + void add_argument(const std::string & flag, int & var, const std::string & help_text = "") { + int_args[flag] = &var; + arguments.push_back({flag, help_text}); + } + + int parse(int argc, const char ** argv) { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (string_args.count(arg)) { + if (i + 1 < argc) { + *string_args[arg] = argv[++i]; + } else { + fprintf(stderr, "error: missing value for %s\n", arg.c_str()); + print_usage(); + return 1; + } + } else if (int_args.count(arg)) { + if (i + 1 < argc) { + if (parse_int_arg(argv[++i], *int_args[arg]) != 0) { + fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]); + print_usage(); + return 1; + } + } else { + fprintf(stderr, "error: missing value for %s\n", arg.c_str()); + print_usage(); + return 1; + } + } else { + fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str()); + print_usage(); + return 1; + } + } + + if (string_args["-m"]->empty()) { + fprintf(stderr, "error: -m is required\n"); + print_usage(); + return 1; + } + + return 0; + } + + private: + const char * program_name; + std::unordered_map string_args; + std::unordered_map int_args; + std::vector arguments; + + int parse_int_arg(const char * arg, int & value) { + char * end; + const long val = std::strtol(arg, &end, 10); + if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) { + value = static_cast(val); + return 0; + } + return 1; + } + + void print_usage() const { + printf("\nUsage:\n"); + printf(" %s 
[OPTIONS]\n\n", program_name); + printf("Options:\n"); + for (const auto & arg : arguments) { + printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str()); + } + + printf("\n"); + } +}; + +class LlamaData { + public: + llama_model_ptr model; + llama_sampler_ptr sampler; + llama_context_ptr context; + std::vector messages; + + int init(const Options & opt) { + model = initialize_model(opt.model_path, opt.ngl); + if (!model) { + return 1; + } + + context = initialize_context(model, opt.n_ctx); + if (!context) { + return 1; + } + + sampler = initialize_sampler(); + return 0; + } + + private: + // Initializes the model and returns a unique pointer to it + llama_model_ptr initialize_model(const std::string & model_path, const int ngl) { + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = ngl; + + llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params)); + if (!model) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + } + + return model; + } + + // Initializes the context with the specified parameters + llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) { + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = n_ctx; + ctx_params.n_batch = n_ctx; + + llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params)); + if (!context) { + fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__); + } + + return context; + } + + // Initializes and configures the sampler + llama_sampler_ptr initialize_sampler() { + llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); + + return sampler; + } +}; + +// Add a message to `messages` and store its content in `owned_content` +static void add_message(const char * role, const std::string & text, LlamaData & llama_data, + std::vector & owned_content) { + char_array_ptr content(new char[text.size() + 1]); + std::strcpy(content.get(), text.c_str()); + llama_data.messages.push_back({role, content.get()}); + owned_content.push_back(std::move(content)); +} + +// Function to apply the chat template and resize `formatted` if needed +static int apply_chat_template(const LlamaData & llama_data, std::vector & formatted, const bool append) { + int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), + llama_data.messages.size(), append, formatted.data(), formatted.size()); + if (result > static_cast(formatted.size())) { + formatted.resize(result); + result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), + llama_data.messages.size(), append, formatted.data(), formatted.size()); + } + + return result; +} + +// Function to tokenize the prompt +static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt, + std::vector & prompt_tokens) { + const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true); + prompt_tokens.resize(n_prompt_tokens); + if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, + true) < 0) { + GGML_ABORT("failed to tokenize the prompt\n"); + } + + return n_prompt_tokens; +} + +// 
Check if we have enough space in the context to evaluate this batch +static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { + const int n_ctx = llama_n_ctx(ctx.get()); + const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); + if (n_ctx_used + batch.n_tokens > n_ctx) { + printf("\033[0m\n"); + fprintf(stderr, "context size exceeded\n"); + return 1; + } + + return 0; +} + +// convert the token to a string +static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) { + char buf[256]; + int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); + if (n < 0) { + GGML_ABORT("failed to convert token to piece\n"); + } + + piece = std::string(buf, n); + return 0; +} + +static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) { + printf("%s", piece.c_str()); + fflush(stdout); + response += piece; +} + +// helper function to evaluate a prompt and generate a response +static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { + std::vector prompt_tokens; + const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens); + if (n_prompt_tokens < 0) { + return 1; + } + + // prepare a batch for the prompt + llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + llama_token new_token_id; + while (true) { + check_context_size(llama_data.context, batch); + if (llama_decode(llama_data.context.get(), batch)) { + GGML_ABORT("failed to decode\n"); + } + + // sample the next token, check is it an end of generation? + new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1); + if (llama_token_is_eog(llama_data.model.get(), new_token_id)) { + break; + } + + std::string piece; + if (convert_token_to_string(llama_data.model, new_token_id, piece)) { + return 1; + } + + print_word_and_concatenate_to_response(piece, response); + + // prepare the next batch with the sampled token + batch = llama_batch_get_one(&new_token_id, 1); + } + + return 0; +} + +static int parse_arguments(const int argc, const char ** argv, Options & opt) { + ArgumentParser parser(argv[0]); + parser.add_argument("-m", opt.model_path, "model"); + parser.add_argument("-p", opt.prompt_non_interactive, "prompt"); + parser.add_argument("-c", opt.n_ctx, "context_size"); + parser.add_argument("-ngl", opt.ngl, "n_gpu_layers"); + if (parser.parse(argc, argv)) { + return 1; + } + + return 0; +} + +static int read_user_input(std::string & user) { + std::getline(std::cin, user); + return user.empty(); // Indicate an error or empty input +} + +// Function to generate a response based on the prompt +static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) { + // Set response color + printf("\033[33m"); + if (generate(llama_data, prompt, response)) { + fprintf(stderr, "failed to generate response\n"); + return 1; + } + + // End response with color reset and newline + printf("\n\033[0m"); + return 0; +} + +// Helper function to apply the chat template and handle errors +static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector & formatted, + const bool is_user_input, int & output_length) { + const int new_len = apply_chat_template(llama_data, formatted, is_user_input); + if (new_len < 0) { + fprintf(stderr, "failed to apply the chat template\n"); + return -1; + } + + output_length = new_len; + 
return 0; +} + +// Helper function to handle user input +static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) { + if (!prompt_non_interactive.empty()) { + user_input = prompt_non_interactive; + return true; // No need for interactive input + } + + printf("\033[32m> \033[0m"); + return !read_user_input(user_input); // Returns false if input ends the loop +} + +// Function to tokenize the prompt +static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) { + std::vector owned_content; + std::vector fmtted(llama_n_ctx(llama_data.context.get())); + int prev_len = 0; + + while (true) { + // Get user input + std::string user_input; + if (!handle_user_input(user_input, prompt_non_interactive)) { + break; + } + + add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data, + owned_content); + + int new_len; + if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) { + return 1; + } + + std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len); + std::string response; + if (generate_response(llama_data, prompt, response)) { + return 1; + } + } + return 0; +} + +static void log_callback(const enum ggml_log_level level, const char * text, void *) { + if (level == GGML_LOG_LEVEL_ERROR) { + fprintf(stderr, "%s", text); + } +} + +static bool is_stdin_a_terminal() { +#if defined(_WIN32) + HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); + DWORD mode; + return GetConsoleMode(hStdin, &mode); +#else + return isatty(STDIN_FILENO); +#endif +} + +static std::string read_pipe_data() { + std::ostringstream result; + result << std::cin.rdbuf(); // Read all data from std::cin + return result.str(); +} + +int main(int argc, const char ** argv) { + Options opt; + if (parse_arguments(argc, argv, opt)) { + return 1; + } + + if (!is_stdin_a_terminal()) { + if (!opt.prompt_non_interactive.empty()) { + opt.prompt_non_interactive += "\n\n"; + } + + opt.prompt_non_interactive += read_pipe_data(); + } + + llama_log_set(log_callback, nullptr); + LlamaData llama_data; + if (llama_data.init(opt)) { + return 1; + } + + if (chat_loop(llama_data, opt.prompt_non_interactive)) { + return 1; + } + + return 0; +} diff --git a/examples/server/README.md b/examples/server/README.md index 0936e0b7b..877768c8b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -412,7 +412,7 @@ node index.js `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` - `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` + `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true` `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. 
If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 6216c0841..c54260867 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -81,7 +81,13 @@
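For orientation, below is a rough usage sketch of the command-line surface introduced in this patch: the `--list-devices`, `-dev/--device`, and `-devd/--device-draft` options added in `common/arg.cpp`, and the new `llama-run` example with its `-m`, `-p`, `-c`, and `-ngl` flags from `examples/run/run.cpp`. Device names and model filenames are placeholders, and `-md/--model-draft` comes from the existing CLI rather than this diff.

```bash
# Print the GPU devices exposed by the loaded backends, then exit
./llama-cli --list-devices

# Offload to one specific device ("none" disables offloading entirely);
# CUDA0 and model.gguf are placeholders -- use whatever --list-devices reports
./llama-cli -m model.gguf -ngl 99 --device CUDA0

# Keep the draft model on a separate device during speculative decoding
# (-md/--model-draft is part of the existing CLI, not added by this patch)
./llama-speculative -m target.gguf -md draft.gguf --device CUDA0 --device-draft CUDA1

# Minimal chat loop via the new llama-run example (-m, -p, -c, -ngl)
./llama-run -m Meta-Llama-3.1-8B-Instruct.gguf -p "Hello"
```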