Merge remote-tracking branch 'upstream/master'

Commit: cf6b987be3
Author: shanshan shen
Date:   2024-11-26 07:10:23 +00:00

69 changed files with 2029 additions and 831 deletions


@@ -24,7 +24,8 @@ body:
   - type: dropdown
     id: operating-system
     attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
       multiple: true
       options:
         - Linux
@@ -41,14 +42,17 @@
       description: Which GGML backends do you know to be affected?
       options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
       multiple: true
+    validations:
+      required: true
   - type: textarea
-    id: steps_to_reproduce
+    id: info
     attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
       description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it.
         If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
       placeholder: >
+        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
         Here are the exact commands that I used: ...
     validations:
       required: true


@@ -26,7 +26,8 @@ body:
   - type: dropdown
     id: operating-system
     attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
       multiple: true
       options:
         - Linux
@@ -43,6 +44,8 @@
       description: Which GGML backends do you know to be affected?
       options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
       multiple: true
+    validations:
+      required: true
   - type: textarea
     id: hardware
     attributes:
@@ -55,20 +58,20 @@
   - type: textarea
     id: model
     attributes:
-      label: Model
+      label: Models
      description: >
-        Which model at which quantization were you using when encountering the bug?
+        Which model(s) at which quantization were you using when encountering the bug?
         If you downloaded a GGUF file off of Huggingface, please provide a link.
       placeholder: >
         e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
     validations:
       required: false
   - type: textarea
-    id: steps_to_reproduce
+    id: info
     attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
       description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it.
         If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
         that information would be very much appreciated by us.
       placeholder: >


@@ -14,7 +14,7 @@ body:
     id: version
     attributes:
       label: Name and Version
-      description: Which version of our software are you running? (use `--version` to get a version string)
+      description: Which version of our software is affected? (You can use `--version` to get a version string.)
       placeholder: |
         $./llama-cli --version
         version: 2999 (42b4109e)
@@ -24,7 +24,8 @@ body:
   - type: dropdown
     id: operating-system
     attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
       multiple: true
       options:
         - Linux
@@ -33,28 +34,30 @@
         - BSD
         - Other? (Please let us know in description)
     validations:
-      required: true
+      required: false
   - type: dropdown
     id: module
     attributes:
       label: Which llama.cpp modules do you know to be affected?
       multiple: true
       options:
+        - Documentation/Github
         - libllama (core library)
         - llama-cli
         - llama-server
         - llama-bench
         - llama-quantize
         - Python/Bash scripts
+        - Test code
         - Other (Please specify in the next section)
     validations:
-      required: true
+      required: false
   - type: textarea
-    id: steps_to_reproduce
+    id: info
     attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
       description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
     validations:
       required: true
   - type: textarea
@@ -62,7 +65,7 @@
     attributes:
       label: First Bad Commit
       description: >
-        If the bug was not present on an earlier version: when did it start appearing?
+        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
         If possible, please do a git bisect and identify the exact commit that introduced the bug.
     validations:
       required: false
@@ -71,8 +74,8 @@
     attributes:
       label: Relevant log output
       description: >
-        Please copy and paste any relevant log output, including the command that you entered and any generated text.
+        If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
         This will be automatically formatted into code, so no need for backticks.
       render: shell
     validations:
-      required: true
+      required: false


@@ -952,7 +952,7 @@ jobs:
     env:
       WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
       ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:
       - name: Clone
@@ -962,7 +962,8 @@ jobs:
           fetch-depth: 0
       - name: Install
-        run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+        run: |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
       - name: Build
         id: cmake_build
@@ -981,27 +982,34 @@ jobs:
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi
-      - name: Pack artifacts
+      - name: Build the release package
        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
        run: |
          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+      - name: Upload the release package
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip


@@ -10,12 +10,10 @@
 name: Publish Docker image

 on:
-  #pull_request:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  workflow_dispatch: # allows manual triggering, useful for debugging
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -29,7 +27,6 @@ permissions:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false
     runs-on: ubuntu-latest
     env:


@@ -163,8 +163,11 @@ if (GGML_TARGET_DEFINES)
     list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
 endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+# all public headers
+set(LLAMA_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 configure_package_config_file(

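For orientation, a minimal sketch of exercising the public-header install rule above; the build directory and install prefix are illustrative:

```bash
# configure, build and install the llama target; with this change both llama.h
# and llama-cpp.h should end up under <prefix>/include
cmake -B build
cmake --build build --target llama -j
cmake --install build --prefix ./llama-install
ls ./llama-install/include
```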

@@ -34,6 +34,7 @@ BUILD_TARGETS = \
 	llama-server \
 	llama-simple \
 	llama-simple-chat \
+	llama-run \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
@@ -251,7 +252,7 @@ endif
 #
 # keep standard at C11 and C++11
-MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
+MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS = -std=c11 -fPIC
 MK_CXXFLAGS = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
@@ -290,6 +291,7 @@ endif
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
 	MK_CPPFLAGS += -D_GNU_SOURCE
+	MK_LDFLAGS += -ldl
 endif

 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
@@ -1166,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+llama-run: examples/run/run.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

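As a quick reference, a sketch of building just the new target through the GNU Make build (job count is illustrative):

```bash
# build only the new llama-run example
make llama-run -j8
```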

@@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
     contentsOf: [
         .define("GGML_USE_ACCELERATE"),
-        .define("GGML_USE_METAL")
+        .define("GGML_USE_METAL"),
+        .define("GGML_USE_CPU")
     ]
 )
 #endif


@@ -298,6 +298,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }

+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -324,6 +345,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
 }

 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
     common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex = ex;
@@ -1312,6 +1336,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1336,10 +1384,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 throw std::invalid_argument("invalid value");
@@ -2042,6 +2086,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.n_ctx = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.speculative.devices = parse_device_list(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",

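A hedged usage sketch for the new device-selection flags added above; the device names (e.g. CUDA0, CUDA1) are illustrative and should be taken from the output of --list-devices, and -md is the existing draft-model flag:

```bash
# print the GPU devices exposed by the loaded backends, then exit
./llama-cli --list-devices

# offload the main model to one device ("none" disables offloading entirely)
./llama-cli -m model.gguf -ngl 99 --device CUDA0

# place the draft model for speculative decoding on a second device
./llama-speculative -m target.gguf -md draft.gguf -ngl 99 -ngld 99 --device CUDA0 --device-draft CUDA1
```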

@@ -979,9 +979,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
     }
 }

-struct llama_model_params common_model_params_to_llama(const common_params & params) {
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();

+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
+
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }


@@ -156,6 +156,7 @@ struct common_params_sampling {
 };

 struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
     int32_t n_ctx = 0; // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -178,9 +179,6 @@ struct common_params {
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -193,6 +191,13 @@ struct common_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = 0.1f; // KV cache defragmentation threshold

+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -201,7 +206,6 @@ struct common_params {
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
@@ -462,7 +466,7 @@ struct common_init_result {

 struct common_init_result common_init_from_params(common_params & params);

-struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_model_params common_model_params_to_llama (      common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);


@@ -90,9 +90,10 @@ bool common_speculative_are_compatible(
     if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
         llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
         llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
-        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
-    ) {
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
         LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
         return false;
     }


@@ -3040,9 +3040,9 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]

-@Model.register("Olmo1124ForCausalLM")
-class Olmo1124Model(Model):
-    model_arch = gguf.MODEL_ARCH.OLMO_1124
+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2

 @Model.register("OlmoeForCausalLM")

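For context, a sketch of invoking the converter on an OLMo 2 checkpoint after the rename; the local path and output name are illustrative:

```bash
# convert a local OLMo 2 Hugging Face checkpoint (architecture Olmo2ForCausalLM) to GGUF
python convert_hf_to_gguf.py ./OLMo-2-1124-7B --outfile olmo2-7b-f16.gguf --outtype f16
```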

@@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(cvector-generator)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
-    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(export-lora)
     add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
@@ -27,29 +24,36 @@ else()
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
-    add_subdirectory(llava)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(main)
     add_subdirectory(parallel)
     add_subdirectory(passkey)
     add_subdirectory(perplexity)
-    add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
-    if (GGML_RPC)
-        add_subdirectory(rpc)
-    endif()
     if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
-    if (GGML_SYCL)
-        add_subdirectory(sycl)
-    endif()
     add_subdirectory(save-load-state)
+    add_subdirectory(run)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
     add_subdirectory(speculative-simple)
     add_subdirectory(tokenize)
+    if (NOT GGML_BACKEND_DL)
+        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        add_subdirectory(quantize-stats)
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
+        if (GGML_SYCL)
+            add_subdirectory(sycl)
+        endif()
+    endif()
 endif()

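A configuration sketch for the flag that gates the block above; with dynamic backend loading enabled, the examples that link backends directly are skipped:

```bash
# configure with dynamically loadable backends; convert-llama2c-to-ggml,
# cvector-generator, export-lora, quantize-stats, llava, rpc and sycl are then excluded
cmake -B build -DGGML_BACKEND_DL=ON
cmake --build build -j
```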

@@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET}
+         COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

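A small sketch of running the relabelled test through CTest (run from the build directory; the label comes from the set_property line above):

```bash
# run only the tests labelled for the eval-callback example
ctest -L eval-callback --output-on-failure
```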

@@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
     cmd_params params = parse_cmd_params(argc, argv);

+    // initialize backends
+    ggml_backend_load_all();
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!cpu_dev) {
+        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+        return 1;
+    }
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
     // initialize llama.cpp
     if (!params.verbose) {
         llama_log_set(llama_null_log_callback, NULL);
@@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
         tpp.poll = t.poll;
         tpp.prio = params.prio;

-        struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
         if (!threadpool) {
             fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
             exit(1);
@@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {

         llama_free(ctx);

-        ggml_threadpool_free(threadpool);
+        ggml_threadpool_free_fn(threadpool);
     }

     llama_free_model(lmodel);


@@ -165,6 +165,10 @@ int main(int argc, char ** argv) {

     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
     struct ggml_threadpool_params tpp_batch =
             ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
     struct ggml_threadpool_params tpp =
@@ -174,7 +178,7 @@ int main(int argc, char ** argv) {

     struct ggml_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
         if (!threadpool_batch) {
             LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
             return 1;
@@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
         tpp.paused = true;
     }

-    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
     if (!threadpool) {
         LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
         return 1;
@@ -890,8 +894,8 @@ int main(int argc, char ** argv) {

     llama_backend_free();

-    ggml_threadpool_free(threadpool);
-    ggml_threadpool_free(threadpool_batch);
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);

     return 0;
 }


@@ -0,0 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/run/README.md (new file, 7 lines)

@@ -0,0 +1,7 @@
# llama.cpp/example/run
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
...
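# A further usage sketch based on run.cpp: when stdin is not a terminal the piped
# text is appended to the prompt; -m selects the model, -c the context size and
# -ngl the number of GPU layers (model file name is illustrative).
echo "Write a haiku about GGUF" | ./llama-run -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048 -ngl 99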

examples/run/run.cpp (new file, 409 lines)

@@ -0,0 +1,409 @@
#if defined(_WIN32)
#include <windows.h>
#else
#include <unistd.h>
#endif
#include <climits>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "llama-cpp.h"
typedef std::unique_ptr<char[]> char_array_ptr;
struct Argument {
std::string flag;
std::string help_text;
};
struct Options {
std::string model_path, prompt_non_interactive;
int ngl = 99;
int n_ctx = 2048;
};
class ArgumentParser {
public:
ArgumentParser(const char * program_name) : program_name(program_name) {}
void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
string_args[flag] = &var;
arguments.push_back({flag, help_text});
}
void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
int_args[flag] = &var;
arguments.push_back({flag, help_text});
}
int parse(int argc, const char ** argv) {
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];
if (string_args.count(arg)) {
if (i + 1 < argc) {
*string_args[arg] = argv[++i];
} else {
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
print_usage();
return 1;
}
} else if (int_args.count(arg)) {
if (i + 1 < argc) {
if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
print_usage();
return 1;
}
} else {
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
print_usage();
return 1;
}
} else {
fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
print_usage();
return 1;
}
}
if (string_args["-m"]->empty()) {
fprintf(stderr, "error: -m is required\n");
print_usage();
return 1;
}
return 0;
}
private:
const char * program_name;
std::unordered_map<std::string, std::string *> string_args;
std::unordered_map<std::string, int *> int_args;
std::vector<Argument> arguments;
int parse_int_arg(const char * arg, int & value) {
char * end;
const long val = std::strtol(arg, &end, 10);
if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
value = static_cast<int>(val);
return 0;
}
return 1;
}
void print_usage() const {
printf("\nUsage:\n");
printf(" %s [OPTIONS]\n\n", program_name);
printf("Options:\n");
for (const auto & arg : arguments) {
printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
}
printf("\n");
}
};
class LlamaData {
public:
llama_model_ptr model;
llama_sampler_ptr sampler;
llama_context_ptr context;
std::vector<llama_chat_message> messages;
int init(const Options & opt) {
model = initialize_model(opt.model_path, opt.ngl);
if (!model) {
return 1;
}
context = initialize_context(model, opt.n_ctx);
if (!context) {
return 1;
}
sampler = initialize_sampler();
return 0;
}
private:
// Initializes the model and returns a unique pointer to it
llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;
llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
if (!model) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
}
return model;
}
// Initializes the context with the specified parameters
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = n_ctx;
ctx_params.n_batch = n_ctx;
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
if (!context) {
fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
}
return context;
}
// Initializes and configures the sampler
llama_sampler_ptr initialize_sampler() {
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
return sampler;
}
};
// Add a message to `messages` and store its content in `owned_content`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
std::vector<char_array_ptr> & owned_content) {
char_array_ptr content(new char[text.size() + 1]);
std::strcpy(content.get(), text.c_str());
llama_data.messages.push_back({role, content.get()});
owned_content.push_back(std::move(content));
}
// Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
llama_data.messages.size(), append, formatted.data(), formatted.size());
if (result > static_cast<int>(formatted.size())) {
formatted.resize(result);
result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
llama_data.messages.size(), append, formatted.data(), formatted.size());
}
return result;
}
// Function to tokenize the prompt
static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
std::vector<llama_token> & prompt_tokens) {
const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
prompt_tokens.resize(n_prompt_tokens);
if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n");
}
return n_prompt_tokens;
}
// Check if we have enough space in the context to evaluate this batch
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
const int n_ctx = llama_n_ctx(ctx.get());
const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
return 1;
}
return 0;
}
// convert the token to a string
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
char buf[256];
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
if (n < 0) {
GGML_ABORT("failed to convert token to piece\n");
}
piece = std::string(buf, n);
return 0;
}
static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
printf("%s", piece.c_str());
fflush(stdout);
response += piece;
}
// helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
std::vector<llama_token> prompt_tokens;
const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
if (n_prompt_tokens < 0) {
return 1;
}
// prepare a batch for the prompt
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
llama_token new_token_id;
while (true) {
check_context_size(llama_data.context, batch);
if (llama_decode(llama_data.context.get(), batch)) {
GGML_ABORT("failed to decode\n");
}
// sample the next token, check is it an end of generation?
new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
break;
}
std::string piece;
if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
return 1;
}
print_word_and_concatenate_to_response(piece, response);
// prepare the next batch with the sampled token
batch = llama_batch_get_one(&new_token_id, 1);
}
return 0;
}
static int parse_arguments(const int argc, const char ** argv, Options & opt) {
ArgumentParser parser(argv[0]);
parser.add_argument("-m", opt.model_path, "model");
parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
parser.add_argument("-c", opt.n_ctx, "context_size");
parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
if (parser.parse(argc, argv)) {
return 1;
}
return 0;
}
static int read_user_input(std::string & user) {
std::getline(std::cin, user);
return user.empty(); // Indicate an error or empty input
}
// Function to generate a response based on the prompt
static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
// Set response color
printf("\033[33m");
if (generate(llama_data, prompt, response)) {
fprintf(stderr, "failed to generate response\n");
return 1;
}
// End response with color reset and newline
printf("\n\033[0m");
return 0;
}
// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
const bool is_user_input, int & output_length) {
const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
if (new_len < 0) {
fprintf(stderr, "failed to apply the chat template\n");
return -1;
}
output_length = new_len;
return 0;
}
// Helper function to handle user input
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
if (!prompt_non_interactive.empty()) {
user_input = prompt_non_interactive;
return true; // No need for interactive input
}
printf("\033[32m> \033[0m");
return !read_user_input(user_input); // Returns false if input ends the loop
}
// Function to tokenize the prompt
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
std::vector<char_array_ptr> owned_content;
std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
int prev_len = 0;
while (true) {
// Get user input
std::string user_input;
if (!handle_user_input(user_input, prompt_non_interactive)) {
break;
}
add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
owned_content);
int new_len;
if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
return 1;
}
std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
std::string response;
if (generate_response(llama_data, prompt, response)) {
return 1;
}
}
return 0;
}
static void log_callback(const enum ggml_log_level level, const char * text, void *) {
if (level == GGML_LOG_LEVEL_ERROR) {
fprintf(stderr, "%s", text);
}
}
static bool is_stdin_a_terminal() {
#if defined(_WIN32)
HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
DWORD mode;
return GetConsoleMode(hStdin, &mode);
#else
return isatty(STDIN_FILENO);
#endif
}
static std::string read_pipe_data() {
std::ostringstream result;
result << std::cin.rdbuf(); // Read all data from std::cin
return result.str();
}
int main(int argc, const char ** argv) {
Options opt;
if (parse_arguments(argc, argv, opt)) {
return 1;
}
if (!is_stdin_a_terminal()) {
if (!opt.prompt_non_interactive.empty()) {
opt.prompt_non_interactive += "\n\n";
}
opt.prompt_non_interactive += read_pipe_data();
}
llama_log_set(log_callback, nullptr);
LlamaData llama_data;
if (llama_data.init(opt)) {
return 1;
}
if (chat_loop(llama_data, opt.prompt_non_interactive)) {
return 1;
}
return 0;
}


@@ -412,7 +412,7 @@ node index.js

 `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1`

-`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
+`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`

 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.

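An illustrative request against the server's /completion endpoint; since cache_prompt now defaults to true, it only needs to be sent explicitly to opt out (host, port and prompt are placeholders):

```bash
curl http://localhost:8080/completion -d '{
  "prompt": "Hello, my name is",
  "n_predict": 16,
  "cache_prompt": false
}'
```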

@@ -81,7 +81,13 @@
             <path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
           </svg>
         </button>
+        <button v-if="messages.length > 0" class="btn mr-1" @click="downloadConv(viewingConvId)" :disabled="isGenerating">
+          <!-- download conversation button -->
+          <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-download" viewBox="0 0 16 16">
+            <path d="M.5 9.9a.5.5 0 0 1 .5.5v2.5a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-2.5a.5.5 0 0 1 1 0v2.5a2 2 0 0 1-2 2H2a2 2 0 0 1-2-2v-2.5a.5.5 0 0 1 .5-.5"/>
+            <path d="M7.646 11.854a.5.5 0 0 0 .708 0l3-3a.5.5 0 0 0-.708-.708L8.5 10.293V1.5a.5.5 0 0 0-1 0v8.793L5.354 8.146a.5.5 0 1 0-.708.708z"/>
+          </svg>
+        </button>
         <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
           <!-- edit config button -->
           <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
@@ -526,6 +532,23 @@
         this.fetchMessages();
       }
     },
+    downloadConv(convId) {
+      const conversation = StorageUtils.getOneConversation(convId);
+      if (!conversation) {
+        alert('Conversation not found.');
+        return;
+      }
+      const conversationJson = JSON.stringify(conversation, null, 2);
+      const blob = new Blob([conversationJson], { type: 'application/json' });
+      const url = URL.createObjectURL(blob);
+      const a = document.createElement('a');
+      a.href = url;
+      a.download = `conversation_${convId}.json`;
+      document.body.appendChild(a);
+      a.click();
+      document.body.removeChild(a);
+      URL.revokeObjectURL(url);
+    },
     async sendMessage() {
       if (!this.inputMsg) return;
       const currConvId = this.viewingConvId;


@@ -2,10 +2,11 @@

 #include "arg.h"
 #include "common.h"
-#include "log.h"
-#include "sampling.h"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "log.h"
+#include "sampling.h"
+#include "speculative.h"

 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -110,7 +111,7 @@ struct server_static_file {

 struct slot_params {
     bool stream = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt

     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -121,12 +122,21 @@ struct slot_params {
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

     std::vector<std::string> antiprompt;
+
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 };

 struct server_slot {
     int id;
     int id_task = -1;

+    llama_batch batch_spec;
+    llama_context * ctx_dft = nullptr;
+    common_speculative * spec = nullptr;
+
     // the index relative to completion multi-task request
     size_t index = 0;
@@ -175,7 +185,6 @@ struct server_slot {

     // sampling
     json json_schema;
-    struct common_params_sampling sparams;

     struct common_sampler * smpl = nullptr;

     llama_token sampled;
@@ -212,7 +221,7 @@ struct server_slot {
         generated_token_probs.clear();
     }

-    bool has_budget(common_params &global_params) {
+    bool has_budget(const common_params & global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
@@ -232,6 +241,10 @@ struct server_slot {
         return state != SLOT_STATE_IDLE;
     }

+    bool can_speculate() const {
+        return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt;
+    }
+
     void add_token(const completion_token_output & token) {
         if (!is_processing()) {
             SLT_WRN(*this, "%s", "slot is not processing\n");
@@ -591,11 +604,14 @@ struct server_response {
 };

 struct server_context {
+    common_params params_base;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;

     std::vector<common_lora_adapter_container> loras;

-    common_params params;
+    llama_model * model_dft = nullptr;
+
+    llama_context_params cparams_dft;

     llama_batch batch = {};
@@ -628,27 +644,41 @@ struct server_context {
             model = nullptr;
         }

+        if (model_dft) {
+            llama_free_model(model_dft);
+            model_dft = nullptr;
+        }
+
         // Clear any sampling context
         for (server_slot & slot : slots) {
-            if (slot.smpl != nullptr) {
-                common_sampler_free(slot.smpl);
-            }
+            common_sampler_free(slot.smpl);
+            slot.smpl = nullptr;
+
+            llama_free(slot.ctx_dft);
+            slot.ctx_dft = nullptr;
+
+            common_speculative_free(slot.spec);
+            slot.spec = nullptr;
+
+            llama_batch_free(slot.batch_spec);
         }

         llama_batch_free(batch);
     }

-    bool load_model(const common_params & params_) {
-        params = params_;
-
-        common_init_result llama_init = common_init_from_params(params);
+    bool load_model(const common_params & params) {
+        SRV_INF("loading model '%s'\n", params.model.c_str());
+
+        params_base = params;
+
+        common_init_result llama_init = common_init_from_params(params_base);

         model = llama_init.model;
         ctx = llama_init.context;
         loras = llama_init.lora_adapters;

         if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
             return false;
         }
@@ -657,6 +687,41 @@ struct server_context {
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);

+        if (!params_base.speculative.model.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+
+            auto params_dft = params_base;
+
+            params_dft.devices = params_base.speculative.devices;
+            params_dft.model = params_base.speculative.model;
+            params_dft.n_ctx = params_base.speculative.n_ctx;
+            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+
+            common_init_result llama_init_dft = common_init_from_params(params_dft);
+
+            model_dft = llama_init_dft.model;
+
+            if (model_dft == nullptr) {
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                return false;
+            }
+
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+
+                llama_free (llama_init_dft.context);
+                llama_free_model(llama_init_dft.model);
+
+                return false;
+            }
+
+            cparams_dft = common_context_params_to_llama(params_base);
+            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+
+            // the context is not needed - we will create one for each slot
+            llama_free(llama_init_dft.context);
+        }
+
         return true;
     }
@@ -674,20 +739,36 @@ struct server_context {
     }

     void init() {
-        const int32_t n_ctx_slot = n_ctx / params.n_parallel;
+        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;

-        SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
+        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);

-        for (int i = 0; i < params.n_parallel; i++) {
+        for (int i = 0; i < params_base.n_parallel; i++) {
             server_slot slot;

             slot.id = i;
             slot.n_ctx = n_ctx_slot;
-            slot.n_predict = params.n_predict;
+            slot.n_predict = params_base.n_predict;
+
+            if (model_dft) {
+                slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
+
+                slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
+                if (slot.ctx_dft == nullptr) {
+                    SRV_ERR("%s", "failed to create draft context\n");
+                    return;
+                }
+
+                slot.spec = common_speculative_init(slot.ctx_dft);
+                if (slot.spec == nullptr) {
+                    SRV_ERR("%s", "failed to create speculator\n");
+                    return;
+                }
+            }

             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);

-            slot.sparams = params.sampling;
+            slot.params.sampling = params_base.sampling;

             slot.callback_on_release = [this](int) {
                 queue_tasks.pop_deferred_task();
@@ -707,7 +788,7 @@ struct server_context {
         const int32_t n_batch = llama_n_batch(ctx);

         // only a single seq_id per token is needed
-        batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
+        batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
         }

         metrics.init();
@@ -786,9 +867,11 @@ struct server_context {
     }

     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
-        slot_params default_params;
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sampling;
+        slot_params defaults;
+        defaults.sampling = params_base.sampling;
+        defaults.speculative = params_base.speculative;

         const auto & data = task.data;

         if (data.count("__oaicompat") != 0) {
@ -800,41 +883,47 @@ struct server_context {
} }
slot.params.stream = json_value(data, "stream", false); slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", false); slot.params.cache_prompt = json_value(data, "cache_prompt", true);
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent); slot.params.n_indent = json_value(data, "n_indent", defaults.n_indent);
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.params.n_keep = json_value(data, "n_keep", defaults.n_keep);
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.params.n_discard = json_value(data, "n_discard", defaults.n_discard);
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold);
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
slot.sparams.dry_multiplier = json_value(data, "dry_multiplier", default_sparams.dry_multiplier);
slot.sparams.dry_base = json_value(data, "dry_base", default_sparams.dry_base);
slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length);
slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n);
slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep);
slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
//slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement
slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms);
if (slot.sparams.dry_base < 1.0f) slot.params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
{ slot.params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
slot.sparams.dry_base = default_sparams.dry_base; slot.params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
slot.params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
slot.params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
slot.params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
slot.params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp);
slot.params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range);
slot.params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent);
slot.params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n);
slot.params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat);
slot.params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq);
slot.params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present);
slot.params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier);
slot.params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base);
slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
slot.params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
slot.params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
slot.params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
slot.params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl);
slot.params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
slot.params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
slot.params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
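For reference, a hypothetical request body exercising the new per-request speculative options parsed above could look like the following (values are illustrative; `json` is the server's nlohmann alias):

json data = {
    {"prompt",            "Hello, world"},
    {"n_predict",         64},
    {"speculative.n_max", 16},   // maximum number of drafted tokens per step
    {"speculative.n_min", 5},    // drafts shorter than this are ignored
    {"speculative.p_min", 0.9}   // minimum draft token probability
};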
if (slot.params.sampling.dry_base < 1.0f) {
slot.params.sampling.dry_base = defaults.sampling.dry_base;
} }
// sequence breakers for DRY // sequence breakers for DRY
@ -843,8 +932,8 @@ struct server_context {
// Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
if (data.contains("dry_sequence_breakers")) { if (data.contains("dry_sequence_breakers")) {
slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>()); slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
if (slot.sparams.dry_sequence_breakers.empty()) { if (slot.params.sampling.dry_sequence_breakers.empty()) {
send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST);
return false; return false;
} }
@ -859,13 +948,13 @@ struct server_context {
if (data.contains("json_schema") && !data.contains("grammar")) { if (data.contains("json_schema") && !data.contains("grammar")) {
try { try {
auto schema = json_value(data, "json_schema", json::object()); auto schema = json_value(data, "json_schema", json::object());
slot.sparams.grammar = json_schema_to_grammar(schema); slot.params.sampling.grammar = json_schema_to_grammar(schema);
} catch (const std::exception & e) { } catch (const std::exception & e) {
send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
return false; return false;
} }
} else { } else {
slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
} }
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@ -875,10 +964,10 @@ struct server_context {
} }
{ {
slot.sparams.logit_bias.clear(); slot.params.sampling.logit_bias.clear();
if (json_value(data, "ignore_eos", false) && has_eos_token) { if (json_value(data, "ignore_eos", false) && has_eos_token) {
slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY}); slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
} }
const auto & logit_bias = data.find("logit_bias"); const auto & logit_bias = data.find("logit_bias");
@ -899,12 +988,12 @@ struct server_context {
if (el[0].is_number_integer()) { if (el[0].is_number_integer()) {
llama_token tok = el[0].get<llama_token>(); llama_token tok = el[0].get<llama_token>();
if (tok >= 0 && tok < n_vocab) { if (tok >= 0 && tok < n_vocab) {
slot.sparams.logit_bias.push_back({tok, bias}); slot.params.sampling.logit_bias.push_back({tok, bias});
} }
} else if (el[0].is_string()) { } else if (el[0].is_string()) {
auto toks = common_tokenize(model, el[0].get<std::string>(), false); auto toks = common_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks) { for (auto tok : toks) {
slot.sparams.logit_bias.push_back({tok, bias}); slot.params.sampling.logit_bias.push_back({tok, bias});
} }
} }
} }
@ -935,16 +1024,16 @@ struct server_context {
sampler_names.emplace_back(name); sampler_names.emplace_back(name);
} }
} }
slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false); slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false);
} else if (samplers->is_string()){ } else if (samplers->is_string()){
std::string sampler_string; std::string sampler_string;
for (const auto & name : *samplers) { for (const auto & name : *samplers) {
sampler_string += name; sampler_string += name;
} }
slot.sparams.samplers = common_sampler_types_from_chars(sampler_string); slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string);
} }
} else { } else {
slot.sparams.samplers = default_sparams.samplers; slot.params.sampling.samplers = defaults.sampling.samplers;
} }
} }
@ -953,7 +1042,7 @@ struct server_context {
common_sampler_free(slot.smpl); common_sampler_free(slot.smpl);
} }
slot.smpl = common_sampler_init(model, slot.sparams); slot.smpl = common_sampler_init(model, slot.params.sampling);
if (slot.smpl == nullptr) { if (slot.smpl == nullptr) {
// for now, the only error that may happen here is invalid grammar // for now, the only error that may happen here is invalid grammar
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
@ -961,6 +1050,12 @@ struct server_context {
} }
} }
if (slot.ctx_dft) {
llama_batch_free(slot.batch_spec);
slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1);
}
slot.state = SLOT_STATE_STARTED; slot.state = SLOT_STATE_STARTED;
SLT_INF(slot, "%s", "processing task\n"); SLT_INF(slot, "%s", "processing task\n");
@ -978,7 +1073,7 @@ struct server_context {
bool process_token(completion_token_output & result, server_slot & slot) { bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = common_token_to_piece(ctx, result.tok, params.special); const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
slot.sampled = result.tok; slot.sampled = result.tok;
// search stop word and delete it // search stop word and delete it
@ -1043,7 +1138,7 @@ struct server_context {
} }
// check the limits // check the limits
if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) { if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
slot.stopped_limit = true; slot.stopped_limit = true;
slot.has_next_token = false; slot.has_next_token = false;
@ -1136,50 +1231,54 @@ struct server_context {
json get_formated_generation(const server_slot & slot) const { json get_formated_generation(const server_slot & slot) const {
std::vector<std::string> samplers; std::vector<std::string> samplers;
samplers.reserve(slot.sparams.samplers.size()); samplers.reserve(slot.params.sampling.samplers.size());
for (const auto & sampler : slot.sparams.samplers) { for (const auto & sampler : slot.params.sampling.samplers) {
samplers.emplace_back(common_sampler_type_to_str(sampler)); samplers.emplace_back(common_sampler_type_to_str(sampler));
} }
return json { return json {
{"n_ctx", slot.n_ctx}, {"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict}, // Server configured n_predict {"n_predict", slot.n_predict}, // Server configured n_predict
{"model", params.model_alias}, {"model", params_base.model_alias},
{"seed", slot.sparams.seed}, {"seed", slot.params.sampling.seed},
{"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0}, {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
{"temperature", slot.sparams.temp}, {"temperature", slot.params.sampling.temp},
{"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_range", slot.params.sampling.dynatemp_range},
{"dynatemp_exponent", slot.sparams.dynatemp_exponent}, {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent},
{"top_k", slot.sparams.top_k}, {"top_k", slot.params.sampling.top_k},
{"top_p", slot.sparams.top_p}, {"top_p", slot.params.sampling.top_p},
{"min_p", slot.sparams.min_p}, {"min_p", slot.params.sampling.min_p},
{"xtc_probability", slot.sparams.xtc_probability}, {"xtc_probability", slot.params.sampling.xtc_probability},
{"xtc_threshold", slot.sparams.xtc_threshold}, {"xtc_threshold", slot.params.sampling.xtc_threshold},
{"typical_p", slot.sparams.typ_p}, {"typical_p", slot.params.sampling.typ_p},
{"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_last_n", slot.params.sampling.penalty_last_n},
{"repeat_penalty", slot.sparams.penalty_repeat}, {"repeat_penalty", slot.params.sampling.penalty_repeat},
{"presence_penalty", slot.sparams.penalty_present}, {"presence_penalty", slot.params.sampling.penalty_present},
{"frequency_penalty", slot.sparams.penalty_freq}, {"frequency_penalty", slot.params.sampling.penalty_freq},
{"dry_multiplier", slot.sparams.dry_multiplier}, {"dry_multiplier", slot.params.sampling.dry_multiplier},
{"dry_base", slot.sparams.dry_base}, {"dry_base", slot.params.sampling.dry_base},
{"dry_allowed_length", slot.sparams.dry_allowed_length}, {"dry_allowed_length", slot.params.sampling.dry_allowed_length},
{"dry_penalty_last_n", slot.sparams.dry_penalty_last_n}, {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n},
{"dry_sequence_breakers", slot.sparams.dry_sequence_breakers}, {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers},
{"mirostat", slot.sparams.mirostat}, {"mirostat", slot.params.sampling.mirostat},
{"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_tau", slot.params.sampling.mirostat_tau},
{"mirostat_eta", slot.sparams.mirostat_eta}, {"mirostat_eta", slot.params.sampling.mirostat_eta},
{"penalize_nl", slot.sparams.penalize_nl}, {"penalize_nl", slot.params.sampling.penalize_nl},
{"stop", slot.params.antiprompt}, {"stop", slot.params.antiprompt},
{"max_tokens", slot.params.n_predict}, // User configured n_predict {"max_tokens", slot.params.n_predict}, // User configured n_predict
{"n_keep", slot.params.n_keep}, {"n_keep", slot.params.n_keep},
{"n_discard", slot.params.n_discard}, {"n_discard", slot.params.n_discard},
{"ignore_eos", slot.sparams.ignore_eos}, {"ignore_eos", slot.params.sampling.ignore_eos},
{"stream", slot.params.stream}, {"stream", slot.params.stream},
//{"logit_bias", slot.sparams.logit_bias}, //{"logit_bias", slot.params.sampling.logit_bias},
{"n_probs", slot.sparams.n_probs}, {"n_probs", slot.params.sampling.n_probs},
{"min_keep", slot.sparams.min_keep}, {"min_keep", slot.params.sampling.min_keep},
{"grammar", slot.sparams.grammar}, {"grammar", slot.params.sampling.grammar},
{"samplers", samplers}, {"samplers", samplers},
{"speculative", slot.can_speculate()},
{"speculative.n_max", slot.params.speculative.n_max},
{"speculative.n_min", slot.params.speculative.n_min},
{"speculative.p_min", slot.params.speculative.p_min},
}; };
} }
@ -1216,7 +1315,7 @@ struct server_context {
{"index", slot.index}, {"index", slot.index},
}; };
if (slot.sparams.n_probs > 0) { if (slot.params.sampling.n_probs > 0) {
const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
@ -1249,7 +1348,7 @@ struct server_context {
{"content", !slot.params.stream ? slot.generated_text : ""}, {"content", !slot.params.stream ? slot.generated_text : ""},
{"id_slot", slot.id}, {"id_slot", slot.id},
{"stop", true}, {"stop", true},
{"model", params.model_alias}, {"model", params_base.model_alias},
{"tokens_predicted", slot.n_decoded}, {"tokens_predicted", slot.n_decoded},
{"tokens_evaluated", slot.n_prompt_tokens}, {"tokens_evaluated", slot.n_prompt_tokens},
{"generation_settings", get_formated_generation(slot)}, {"generation_settings", get_formated_generation(slot)},
@ -1265,7 +1364,7 @@ struct server_context {
{"index", slot.index}, {"index", slot.index},
}; };
if (slot.sparams.n_probs > 0) { if (slot.params.sampling.n_probs > 0) {
std::vector<completion_token_output> probs; std::vector<completion_token_output> probs;
if (!slot.params.stream && slot.stopped_word) { if (!slot.params.stream && slot.stopped_word) {
const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
@ -1422,10 +1521,10 @@ struct server_context {
data.at("input_prefix"), data.at("input_prefix"),
data.at("input_suffix"), data.at("input_suffix"),
data.at("input_extra"), data.at("input_extra"),
params.n_batch, params_base.n_batch,
params.n_predict, params_base.n_predict,
slots[0].n_ctx, // TODO: there should be a better way slots[0].n_ctx, // TODO: there should be a better way
params.spm_infill, params_base.spm_infill,
tokenized_prompts[i] tokenized_prompts[i]
); );
create_task(data, tokens); create_task(data, tokens);
@ -1798,7 +1897,7 @@ struct server_context {
// TODO: simplify and improve // TODO: simplify and improve
for (server_slot & slot : slots) { for (server_slot & slot : slots) {
if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
if (!params.ctx_shift) { if (!params_base.ctx_shift) {
// this check is redundant (for good) // this check is redundant (for good)
// we should never get here, because generation should have already stopped in process_token() // we should never get here, because generation should have already stopped in process_token()
slot.release(); slot.release();
@ -1864,7 +1963,7 @@ struct server_context {
int32_t batch_type = batch.n_tokens > 0 ? 0 : -1; int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
// next, batch any pending prompts without exceeding n_batch // next, batch any pending prompts without exceeding n_batch
if (params.cont_batching || batch.n_tokens == 0) { if (params_base.cont_batching || batch.n_tokens == 0) {
for (auto & slot : slots) { for (auto & slot : slots) {
// this slot still has a prompt to be processed // this slot still has a prompt to be processed
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
@ -1917,7 +2016,7 @@ struct server_context {
continue; continue;
} }
} else { } else {
if (!params.ctx_shift) { if (!params_base.ctx_shift) {
// if context shift is disabled, we make sure prompt size is smaller than KV size // if context shift is disabled, we make sure prompt size is smaller than KV size
// TODO: there should be a separate parameter that controls prompt truncation // TODO: there should be a separate parameter that controls prompt truncation
// context shift should be applied only during the generation phase // context shift should be applied only during the generation phase
@ -1963,11 +2062,11 @@ struct server_context {
slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens); slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
// reuse chunks from the cached prompt by shifting their KV cache in the new position // reuse chunks from the cached prompt by shifting their KV cache in the new position
if (params.n_cache_reuse > 0) { if (params_base.n_cache_reuse > 0) {
size_t head_c = slot.n_past; // cache size_t head_c = slot.n_past; // cache
size_t head_p = slot.n_past; // current prompt size_t head_p = slot.n_past; // current prompt
SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past); SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);
while (head_c < slot.cache_tokens.size() && while (head_c < slot.cache_tokens.size() &&
head_p < prompt_tokens.size()) { head_p < prompt_tokens.size()) {
@ -1980,7 +2079,7 @@ struct server_context {
n_match++; n_match++;
} }
if (n_match >= (size_t) params.n_cache_reuse) { if (n_match >= (size_t) params_base.n_cache_reuse) {
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
//for (size_t i = head_p; i < head_p + n_match; i++) { //for (size_t i = head_p; i < head_p + n_match; i++) {
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@ -2168,8 +2267,14 @@ struct server_context {
continue; // continue loop of slots continue; // continue loop of slots
} }
llama_token id;
{
completion_token_output result; completion_token_output result;
const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
slot.i_batch = -1;
common_sampler_accept(slot.smpl, id, true); common_sampler_accept(slot.smpl, id, true);
@ -2184,7 +2289,7 @@ struct server_context {
const auto * cur_p = common_sampler_get_candidates(slot.smpl); const auto * cur_p = common_sampler_get_candidates(slot.smpl);
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
result.probs.push_back({ result.probs.push_back({
cur_p->data[i].id, cur_p->data[i].id,
i >= cur_p->size ? 0.0f : cur_p->data[i].p, i >= cur_p->size ? 0.0f : cur_p->data[i].p,
@ -2197,9 +2302,64 @@ struct server_context {
slot.print_timings(); slot.print_timings();
send_final_response(slot); send_final_response(slot);
metrics.on_prediction(slot); metrics.on_prediction(slot);
continue;
}
} }
slot.i_batch = -1; // check if the slot supports speculative decoding
if (!slot.can_speculate()) {
continue;
}
struct common_speculative_params params_spec;
params_spec.n_draft = slot.params.speculative.n_max;
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
params_spec.p_min = slot.params.speculative.p_min;
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
// ignore small drafts
if (slot.params.speculative.n_min > (int) draft.size()) {
continue;
}
// construct the speculation batch
common_batch_clear(slot.batch_spec);
common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
for (size_t i = 0; i < draft.size(); ++i) {
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
}
llama_decode(ctx, slot.batch_spec);
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
slot.n_past += ids.size();
slot.n_decoded += ids.size();
slot.cache_tokens.push_back(id);
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
for (size_t i = 0; i < ids.size(); ++i) {
completion_token_output result;
result.tok = ids[i];
if (!process_token(result, slot)) {
// release slot because of stop condition
slot.release();
slot.print_timings();
send_final_response(slot);
metrics.on_prediction(slot);
break;
}
}
SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size());
} }
} }
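Outside the server loop, the same speculative helpers compose as in this minimal sketch (assumes an initialized target context `ctx_tgt`, draft context `ctx_dft`, sampler `smpl`, the already-decoded tokens `prompt_tgt`, the last sampled token `id_last` and its position `n_past`; numbers are illustrative):

struct common_speculative * spec = common_speculative_init(ctx_dft);

struct common_speculative_params params_spec;
params_spec.n_draft = 16;
params_spec.n_reuse = llama_n_ctx(ctx_dft) - params_spec.n_draft;
params_spec.p_min   = 0.9f;

llama_batch batch_tgt = llama_batch_init(params_spec.n_draft + 1, 0, 1);

// one speculative step: draft with the small model, then verify with the target model
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);

common_batch_clear(batch_tgt);
common_batch_add (batch_tgt, id_last, n_past, { 0 }, true);
for (size_t i = 0; i < draft.size(); ++i) {
    common_batch_add(batch_tgt, draft[i], n_past + 1 + i, { 0 }, true);
}
llama_decode(ctx_tgt, batch_tgt);

// ids contains the token sampled by the target plus however many draft tokens it accepted
const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);

llama_batch_free(batch_tgt);
common_speculative_free(spec);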
@ -2697,7 +2857,7 @@ int main(int argc, char ** argv) {
const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
json data = { json data = {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel }, { "total_slots", ctx_server.params_base.n_parallel },
{ "chat_template", llama_get_chat_template(ctx_server.model) }, { "chat_template", llama_get_chat_template(ctx_server.model) },
}; };
@ -2705,7 +2865,7 @@ int main(int argc, char ** argv) {
}; };
const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
if (!ctx_server.params.endpoint_props) { if (!ctx_server.params_base.endpoint_props) {
res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }
@ -2718,7 +2878,7 @@ int main(int argc, char ** argv) {
}; };
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
if (ctx_server.params.embedding) { if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }
@ -2824,7 +2984,7 @@ int main(int argc, char ** argv) {
// TODO: maybe merge this function with "handle_completions_generic" // TODO: maybe merge this function with "handle_completions_generic"
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) { if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }
@ -3001,7 +3161,7 @@ int main(int argc, char ** argv) {
}; };
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
if (!ctx_server.params.reranking || ctx_server.params.embedding) { if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }

View file

@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
} }
}, nullptr); }, nullptr);
// load dynamic backends
ggml_backend_load_all();
// initialize the model // initialize the model
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl; model_params.n_gpu_layers = ngl;

View file

@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
} }
} }
// load dynamic backends
ggml_backend_load_all();
// initialize the model // initialize the model
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();

View file

@ -46,6 +46,7 @@ int main(int argc, char ** argv) {
ctx_tgt = llama_init_tgt.context; ctx_tgt = llama_init_tgt.context;
// load the draft model // load the draft model
params.devices = params.speculative.devices;
params.model = params.speculative.model; params.model = params.speculative.model;
params.n_ctx = params.speculative.n_ctx; params.n_ctx = params.speculative.n_ctx;
params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch; params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;

View file

@ -76,6 +76,7 @@ int main(int argc, char ** argv) {
ctx_tgt = llama_init_tgt.context; ctx_tgt = llama_init_tgt.context;
// load the draft model // load the draft model
params.devices = params.speculative.devices;
params.model = params.speculative.model; params.model = params.speculative.model;
params.n_gpu_layers = params.speculative.n_gpu_layers; params.n_gpu_layers = params.speculative.n_gpu_layers;
if (params.speculative.cpuparams.n_threads > 0) { if (params.speculative.cpuparams.n_threads > 0) {

View file

@ -33,6 +33,7 @@ else()
endif() endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
# #
# option list # option list

View file

@ -190,6 +190,14 @@ extern "C" {
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array) // Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
struct ggml_backend_feature {
const char * name;
const char * value;
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
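A sketch of what a backend-side implementation of this interface could look like (the `foo` name and feature entries are illustrative; the array is NULL-terminated as noted above):

static struct ggml_backend_feature * ggml_backend_foo_get_features(ggml_backend_reg_t reg) {
    static struct ggml_backend_feature features[] = {
        { "FMA",  "1" },
        { "AVX2", "1" },
        { NULL,   NULL }, // terminator
    };
    (void) reg;
    return features;
}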
// //
// Backend registry // Backend registry
@ -214,6 +222,13 @@ extern "C" {
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void); GGML_API ggml_backend_t ggml_backend_init_best(void);
// Load a backend from a dynamic library and register it
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
// Unload a backend if loaded dynamically and unregister it
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
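A minimal usage sketch of the new dynamic loading API (the library path is illustrative; `ggml_backend_init_best` and `ggml_backend_free` are the existing helpers for device selection and cleanup):

#include "ggml-backend.h"

int main(void) {
    // load every backend library found next to the executable / in the search paths
    ggml_backend_load_all();

    // optionally, load one specific backend library by path
    ggml_backend_reg_t extra = ggml_backend_load("./libggml-cuda.so");

    // pick the best available device (GPU if present, otherwise CPU)
    ggml_backend_t backend = ggml_backend_init_best();

    // ... build and compute graphs with the backend ...

    ggml_backend_free(backend);
    if (extra) {
        ggml_backend_unload(extra);
    }
    return 0;
}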
// //
// Backend scheduler // Backend scheduler
// //

View file

@ -7,29 +7,6 @@
extern "C" { extern "C" {
#endif #endif
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute() // the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287 // since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan { struct ggml_cplan {
@ -75,12 +52,9 @@ extern "C" {
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
@ -104,10 +78,10 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_sse3 (void); GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void); GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void); GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void); GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void); GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void); GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);

View file

@ -2215,6 +2215,37 @@ extern "C" {
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
// ggml threadpool
// TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
// the goal should be to create an API that other backends can use and move everything to the ggml base
// scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
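A minimal sketch of how the relocated threadpool declarations are used together with the functions that remain in ggml-cpu.h (thread count is illustrative):

struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
tpp.prio = GGML_SCHED_PRIO_NORMAL;

struct ggml_threadpool * tp = ggml_threadpool_new(&tpp); // declared in ggml-cpu.h
// ... attach the threadpool to the CPU backend / compute plan and run graphs ...
ggml_threadpool_free(tp);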
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View file

@ -202,6 +202,10 @@ endif()
# ggml # ggml
if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
endif()
add_library(ggml-base add_library(ggml-base
../include/ggml.h ../include/ggml.h
../include/ggml-alloc.h ../include/ggml-alloc.h
@ -226,6 +230,31 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base) target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_libraries(ggml PRIVATE dl)
endif()
function(ggml_add_backend_library backend)
if (GGML_BACKEND_DL)
add_library(${backend} MODULE ${ARGN})
# write the shared library to the output directory
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
else()
add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend})
install(TARGETS ${backend} LIBRARY)
endif()
target_link_libraries(${backend} PRIVATE ggml-base)
target_include_directories(${backend} PRIVATE ..)
if (${BUILD_SHARED_LIBS})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED)
endif()
endfunction()
function(ggml_add_backend backend) function(ggml_add_backend backend)
string(TOUPPER "GGML_${backend}" backend_id) string(TOUPPER "GGML_${backend}" backend_id)
if (${backend_id}) if (${backend_id})
@ -236,16 +265,12 @@ function(ggml_add_backend backend)
# however, currently it is necessary for AMX, since it is enabled by default on llama.cpp # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
if (${backend_id}) if (${backend_id})
message(STATUS "Including ${backend} backend") message(STATUS "Including ${backend} backend")
if (${BUILD_SHARED_LIBS}) if (NOT GGML_BACKEND_DL)
target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED)
endif()
install(TARGETS ${backend_target} LIBRARY)
target_link_libraries(ggml PUBLIC ${backend_target})
string(TOUPPER "GGML_USE_${backend}" backend_use) string(TOUPPER "GGML_USE_${backend}" backend_use)
target_compile_definitions(ggml PUBLIC ${backend_use}) target_compile_definitions(ggml PUBLIC ${backend_use})
endif() endif()
endif() endif()
endif()
endfunction() endfunction()
ggml_add_backend(CPU) ggml_add_backend(CPU)
@ -256,10 +281,10 @@ ggml_add_backend(CUDA)
ggml_add_backend(HIP) ggml_add_backend(HIP)
ggml_add_backend(Kompute) ggml_add_backend(Kompute)
ggml_add_backend(METAL) ggml_add_backend(METAL)
ggml_add_backend(MUSA)
ggml_add_backend(RPC) ggml_add_backend(RPC)
ggml_add_backend(SYCL) ggml_add_backend(SYCL)
ggml_add_backend(Vulkan) ggml_add_backend(Vulkan)
ggml_add_backend(MUSA)
foreach (target ggml-base ggml) foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

View file

@ -9,12 +9,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MA
file(GLOB GGML_SOURCES_AMX "*.cpp") file(GLOB GGML_SOURCES_AMX "*.cpp")
add_library(ggml-amx ggml_add_backend_library(ggml-amx
${GGML_HEADERS_AMX} ${GGML_HEADERS_AMX}
${GGML_SOURCES_AMX}) ${GGML_SOURCES_AMX}
)
target_link_libraries(ggml-amx PRIVATE ggml-base)
target_include_directories(ggml-amx PRIVATE . ..)
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
# TODO: integrate AMX backend into the CPU backend # TODO: integrate AMX backend into the CPU backend

View file

@ -409,6 +409,7 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
ggml_backend_reg_t ggml_backend_amx_reg(void) { ggml_backend_reg_t ggml_backend_amx_reg(void) {
static struct ggml_backend_reg ggml_backend_amx_reg = { static struct ggml_backend_reg ggml_backend_amx_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_amx_reg_i, /* .iface = */ ggml_backend_amx_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -444,3 +445,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
} }
#endif #endif
GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)

View file

@ -8,6 +8,8 @@
extern "C" { extern "C" {
#endif #endif
#define GGML_BACKEND_API_VERSION 1
// //
// Backend buffer type // Backend buffer type
// //
@ -63,20 +65,20 @@ extern "C" {
enum ggml_backend_buffer_usage usage; enum ggml_backend_buffer_usage usage;
}; };
ggml_backend_buffer_t ggml_backend_buffer_init( GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft, ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface, struct ggml_backend_buffer_i iface,
void * context, void * context,
size_t size); size_t size);
// do not use directly, use ggml_backend_tensor_copy instead // do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// multi-buffer // multi-buffer
// buffer that contains a collection of buffers // buffer that contains a collection of buffers
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
// //
// Backend (stream) // Backend (stream)
@ -199,17 +201,37 @@ extern "C" {
}; };
struct ggml_backend_reg { struct ggml_backend_reg {
// int api_version; // TODO: for dynamic loading int api_version; // initialize to GGML_BACKEND_API_VERSION
struct ggml_backend_reg_i iface; struct ggml_backend_reg_i iface;
void * context; void * context;
}; };
// Internal backend registry API // Internal backend registry API
void ggml_backend_register(ggml_backend_reg_t reg); GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
void ggml_backend_device_register(ggml_backend_dev_t device); GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
// typedef ggml_backend_register_t * (*ggml_backend_init)(void); // Add backend dynamic loading support to the backend
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
#ifdef GGML_BACKEND_DL
#ifdef __cplusplus
# define GGML_BACKEND_DL_IMPL(reg_fn) \
extern "C" { \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
} \
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
#else
# define GGML_BACKEND_DL_IMPL(reg_fn) \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
#endif
#else
# define GGML_BACKEND_DL_IMPL(reg_fn)
#endif
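A sketch of how a backend translation unit is expected to use this macro, mirroring the AMX registration above and the BLAS one further down (the `foo` names are placeholders):

ggml_backend_reg_t ggml_backend_foo_reg(void) {
    static struct ggml_backend_reg reg = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface       = */ ggml_backend_foo_reg_i, // the backend's ggml_backend_reg_i table
        /* .context     = */ NULL,
    };
    return &reg;
}

// with GGML_BACKEND_DL defined this expands to an exported ggml_backend_init()
// that the registry resolves via dlsym()/GetProcAddress(); otherwise it expands to nothing
GGML_BACKEND_DL_IMPL(ggml_backend_foo_reg)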
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -1,11 +1,29 @@
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include <algorithm>
#include <cstring> #include <cstring>
#include <string>
#include <vector> #include <vector>
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#elif defined(__APPLE__)
# include <mach-o/dyld.h>
# include <dlfcn.h>
#else
# include <dlfcn.h>
# include <unistd.h>
#endif
// Backend registry // Backend registry
#ifdef GGML_USE_CPU
#include "ggml-cpu.h"
#endif
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
#include "ggml-cuda.h" #include "ggml-cuda.h"
@ -43,8 +61,13 @@
#include "ggml-kompute.h" #include "ggml-kompute.h"
#endif #endif
struct ggml_backend_reg_entry {
ggml_backend_reg_t reg;
void * handle;
};
struct ggml_backend_registry { struct ggml_backend_registry {
std::vector<ggml_backend_reg_t> backends; std::vector<ggml_backend_reg_entry> backends;
std::vector<ggml_backend_dev_t> devices; std::vector<ggml_backend_dev_t> devices;
ggml_backend_registry() { ggml_backend_registry() {
@ -75,11 +98,19 @@ struct ggml_backend_registry {
#ifdef GGML_USE_KOMPUTE #ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg()); register_backend(ggml_backend_kompute_reg());
#endif #endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg()); register_backend(ggml_backend_cpu_reg());
#endif
} }
void register_backend(ggml_backend_reg_t reg) { ~ggml_backend_registry() {
while (!backends.empty()) {
// use silent since the log system may have been destroyed at this point
unload_backend(backends.back().reg, true);
}
}
void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
if (!reg) { if (!reg) {
return; return;
} }
@ -88,7 +119,7 @@ struct ggml_backend_registry {
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif #endif
backends.push_back(reg); backends.push_back({ reg, handle });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i)); register_device(ggml_backend_reg_dev_get(reg, i));
} }
@ -100,6 +131,111 @@ struct ggml_backend_registry {
#endif #endif
devices.push_back(device); devices.push_back(device);
} }
ggml_backend_reg_t load_backend(const char * path, bool silent) {
#ifdef _WIN32
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryA(path);
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
}
SetErrorMode(old_mode);
return nullptr;
}
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
SetErrorMode(old_mode);
if (!backend_init) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
}
FreeLibrary(handle);
return nullptr;
}
#else
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
}
return nullptr;
}
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
if (!backend_init) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
}
dlclose(handle);
return nullptr;
}
#endif
ggml_backend_reg_t reg = backend_init();
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) {
if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
} else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
}
}
#ifdef _WIN32
FreeLibrary(handle);
#else
dlclose(handle);
#endif
return nullptr;
}
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
register_backend(reg, handle);
return reg;
}
void unload_backend(ggml_backend_reg_t reg, bool silent) {
auto it = std::find_if(backends.begin(), backends.end(),
[reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
if (it == backends.end()) {
if (!silent) {
GGML_LOG_ERROR("%s: backend not found\n", __func__);
}
return;
}
if (!silent) {
GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
}
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
devices.end());
// unload library
if (it->handle) {
#ifdef _WIN32
FreeLibrary((HMODULE) it->handle);
#else
dlclose(it->handle);
#endif
}
// remove backend
backends.erase(it);
}
}; };
static ggml_backend_registry & get_reg() { static ggml_backend_registry & get_reg() {
@ -117,23 +253,32 @@ void ggml_backend_device_register(ggml_backend_dev_t device) {
} }
// Backend (reg) enumeration // Backend (reg) enumeration
static bool striequals(const char * a, const char * b) {
for (; *a && *b; a++, b++) {
if (std::tolower(*a) != std::tolower(*b)) {
return false;
}
}
return *a == *b;
}
size_t ggml_backend_reg_count() { size_t ggml_backend_reg_count() {
return get_reg().backends.size(); return get_reg().backends.size();
} }
ggml_backend_reg_t ggml_backend_reg_get(size_t index) { ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
GGML_ASSERT(index < ggml_backend_reg_count()); GGML_ASSERT(index < ggml_backend_reg_count());
return get_reg().backends[index]; return get_reg().backends[index].reg;
} }
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_reg_count(); i++) { for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
ggml_backend_reg_t reg = ggml_backend_reg_get(i); ggml_backend_reg_t reg = ggml_backend_reg_get(i);
if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) { if (striequals(ggml_backend_reg_name(reg), name)) {
return reg; return reg;
} }
} }
return NULL; return nullptr;
} }
// Device enumeration // Device enumeration
@ -149,11 +294,11 @@ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i); ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (strcmp(ggml_backend_dev_name(dev), name) == 0) { if (striequals(ggml_backend_dev_name(dev), name)) {
return dev; return dev;
} }
} }
return NULL; return nullptr;
} }
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
@ -163,14 +308,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
return dev; return dev;
} }
} }
return NULL; return nullptr;
} }
// Convenience functions // Convenience functions
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
if (!dev) { if (!dev) {
return NULL; return nullptr;
} }
return ggml_backend_dev_init(dev, params); return ggml_backend_dev_init(dev, params);
} }
@ -178,7 +323,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (!dev) { if (!dev) {
return NULL; return nullptr;
} }
return ggml_backend_dev_init(dev, params); return ggml_backend_dev_init(dev, params);
} }
@ -189,7 +334,97 @@ ggml_backend_t ggml_backend_init_best(void) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
} }
if (!dev) { if (!dev) {
return NULL; return nullptr;
} }
return ggml_backend_dev_init(dev, NULL); return ggml_backend_dev_init(dev, nullptr);
}
// Dynamic loading
ggml_backend_reg_t ggml_backend_load(const char * path) {
return get_reg().load_backend(path, false);
}
void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true);
}
void ggml_backend_load_all() {
std::vector<std::string> search_prefix;
// add the executable directory to the search path
// FIXME: this is convenient for development, but it should probably be disabled in production
#if defined(__APPLE__)
// get executable path
std::vector<char> path;
uint32_t size;
while (true) {
size = path.size();
if (_NSGetExecutablePath(path.data(), &size) == 0) {
break;
}
path.resize(size);
}
std::string base_path(path.data(), size);
// remove executable name
auto last_slash = base_path.find_last_of('/');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
search_prefix.push_back(base_path + "/");
#elif defined(__linux__)
std::string base_path = ".";
std::vector<char> path(1024);
while (true) {
// get executable path
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
if (len == -1) {
break;
}
if (len < (ssize_t) path.size()) {
base_path = std::string(path.data(), len);
// remove executable name
auto last_slash = base_path.find_last_of('/');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
break;
}
path.resize(path.size() * 2);
}
search_prefix.push_back(base_path + "/");
#endif
auto & reg = get_reg();
auto try_load = [&](const std::string & name) {
std::string os_name;
#ifdef _WIN32
os_name = "ggml-" + name + ".dll";
#else
os_name = "libggml-" + name + ".so";
#endif
if (reg.load_backend(os_name.c_str(), true)) {
return;
}
for (const auto & prefix : search_prefix) {
if (reg.load_backend((prefix + os_name).c_str(), true)) {
return;
}
}
};
try_load("amx");
try_load("blas");
try_load("cann");
try_load("cuda");
try_load("hip");
try_load("kompute");
try_load("metal");
try_load("rpc");
try_load("sycl");
try_load("vulkan");
try_load("musa");
try_load("cpu");
} }
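For context, a minimal usage sketch of the dynamic-loading API added here (hypothetical, not part of the diff; assumes the declarations are visible via ggml-backend.h): load every backend found next to the executable, then enumerate the registered devices.

// Hypothetical usage of the dynamic backend loader shown above.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_load_all();  // probes ggml-<name>.dll / libggml-<name>.so, including the executable directory
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        std::printf("device %zu: %s\n", i, ggml_backend_dev_name(dev));
    }
    return 0;
}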

@ -11,13 +11,10 @@ find_package(BLAS)
if (BLAS_FOUND) if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
add_library(ggml-blas ggml_add_backend_library(ggml-blas
ggml-blas.cpp ggml-blas.cpp
) )
target_link_libraries(ggml-blas PRIVATE ggml-base)
target_include_directories(ggml-blas PRIVATE . ..)
if (${GGML_BLAS_VENDOR} MATCHES "Apple") if (${GGML_BLAS_VENDOR} MATCHES "Apple")
add_compile_definitions(ACCELERATE_NEW_LAPACK) add_compile_definitions(ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64) add_compile_definitions(ACCELERATE_LAPACK_ILP64)

@ -506,9 +506,12 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
ggml_backend_reg_t ggml_backend_blas_reg(void) { ggml_backend_reg_t ggml_backend_blas_reg(void) {
static struct ggml_backend_reg ggml_backend_blas_reg = { static struct ggml_backend_reg ggml_backend_blas_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_blas_reg_i, /* .iface = */ ggml_backend_blas_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
return &ggml_backend_blas_reg; return &ggml_backend_blas_reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)

@ -61,9 +61,9 @@ if (CANN_INSTALL_DIR)
file(GLOB GGML_SOURCES_CANN "*.cpp") file(GLOB GGML_SOURCES_CANN "*.cpp")
add_library(ggml-cann ${GGML_SOURCES_CANN}) ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES}) target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS}) target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")

@ -2080,7 +2080,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
dev_ctx->name = GGML_CANN_NAME + std::to_string(i); dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
ggml_cann_set_device(i); ggml_cann_set_device(i);
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_cann_device_interface, /* .iface = */ ggml_backend_cann_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
/* .context = */ dev_ctx /* .context = */ dev_ctx
}; };
@ -2088,7 +2088,8 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
} }
reg = ggml_backend_reg { reg = ggml_backend_reg {
/* .interface = */ ggml_backend_cann_reg_interface, /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cann_reg_interface,
/* .context = */ ctx /* .context = */ ctx
}; };
} }
@ -2142,3 +2143,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
ggml_cann_set_device(device); ggml_cann_set_device(device);
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)

@ -1,4 +1,4 @@
add_library(ggml-cpu ggml_add_backend_library(ggml-cpu
ggml-cpu.c ggml-cpu.c
ggml-cpu.cpp ggml-cpu.cpp
ggml-cpu-aarch64.c ggml-cpu-aarch64.c
@ -7,8 +7,7 @@ add_library(ggml-cpu
ggml-cpu-quants.h ggml-cpu-quants.h
) )
target_link_libraries(ggml-cpu PRIVATE ggml-base) target_include_directories(ggml-cpu PRIVATE .)
target_include_directories(ggml-cpu PRIVATE . ..)
if (APPLE AND GGML_ACCELERATE) if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate) find_library(ACCELERATE_FRAMEWORK Accelerate)

@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
#endif // GGML_USE_OPENMP #endif // GGML_USE_OPENMP
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
p->n_threads = n_threads;
p->prio = 0; // default priority (usually means normal or inherited)
p->poll = 50; // hybrid-polling enabled
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
p->paused = false; // threads are ready to go
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
}
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
struct ggml_threadpool_params p;
ggml_threadpool_params_init(&p, n_threads);
return p;
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}
static struct ggml_threadpool * ggml_threadpool_new_impl( static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_threadpool_params * tpp, struct ggml_threadpool_params * tpp,
struct ggml_cgraph * cgraph, struct ggml_cgraph * cgraph,

@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
return &ggml_backend_cpu_device; return &ggml_backend_cpu_device;
} }
struct ggml_backend_feature {
const char * name;
const char * value;
};
// Not used yet
// This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically, // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
// and additionally to allow other backends to expose their own list of features that applications can query using the same API. // and additionally to allow other backends to expose their own list of features that applications can query using the same API
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) { static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() { static std::vector<ggml_backend_feature> features = []() {
ggml_cpu_init();
std::vector<ggml_backend_feature> features; std::vector<ggml_backend_feature> features;
if (ggml_cpu_has_sse3()) { if (ggml_cpu_has_sse3()) {
features.push_back({ "SSE3", "1" }); features.push_back({ "SSE3", "1" });
@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_avx()) { if (ggml_cpu_has_avx()) {
features.push_back({ "AVX", "1" }); features.push_back({ "AVX", "1" });
} }
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx2()) { if (ggml_cpu_has_avx2()) {
features.push_back({ "AVX2", "1" }); features.push_back({ "AVX2", "1" });
} }
@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_fma()) { if (ggml_cpu_has_fma()) {
features.push_back({ "FMA", "1" }); features.push_back({ "FMA", "1" });
} }
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx512()) { if (ggml_cpu_has_avx512()) {
features.push_back({ "AVX512", "1" }); features.push_back({ "AVX512", "1" });
} }
@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_llamafile()) { if (ggml_cpu_has_llamafile()) {
features.push_back({ "LLAMAFILE", "1" }); features.push_back({ "LLAMAFILE", "1" });
} }
// TODO: rename this
#ifdef GGML_USE_CPU_AARCH64
features.push_back({ "AARCH64_REPACK", "1" });
#endif
features.push_back({ nullptr, nullptr }); features.push_back({ nullptr, nullptr });
@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts; return (void *)ggml_backend_cpu_get_extra_bufts;
} }
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cpu_get_features;
}
if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
return (void *)ggml_backend_cpu_set_abort_callback;
}
if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
return (void *)ggml_numa_init;
}
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
return (void *)ggml_is_numa;
}
// threadpool - TODO: move to ggml-base
if (strcmp(name, "ggml_threadpool_new") == 0) {
return (void *)ggml_threadpool_new;
}
if (strcmp(name, "ggml_threadpool_free") == 0) {
return (void *)ggml_threadpool_free;
}
if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
return (void *)ggml_backend_cpu_set_threadpool;
}
return NULL; return NULL;
@ -655,9 +678,12 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
ggml_cpu_init(); ggml_cpu_init();
static struct ggml_backend_reg ggml_backend_cpu_reg = { static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cpu_reg_i, /* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
return &ggml_backend_cpu_reg; return &ggml_backend_cpu_reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
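A hedged sketch of how a caller might consume the "ggml_backend_get_features" proc address registered above (assumption: the ggml_backend_get_features_t typedef and the feature struct are exposed through ggml-backend.h; the pattern mirrors the llama_print_system_info change later in this diff):

// Hypothetical feature dump through the proc-address mechanism.
#include "ggml-backend.h"
#include <cstdio>

static void dump_backend_features(ggml_backend_reg_t reg) {
    auto * get_features_fn = (ggml_backend_get_features_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (!get_features_fn) {
        return;  // this backend does not expose a feature list
    }
    for (ggml_backend_feature * f = get_features_fn(reg); f->name; f++) {
        std::printf("%s = %s\n", f->name, f->value);
    }
}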

@ -46,14 +46,11 @@ if (CUDAToolkit_FOUND)
list(APPEND GGML_SOURCES_CUDA ${SRCS}) list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif() endif()
add_library(ggml-cuda ggml_add_backend_library(ggml-cuda
${GGML_HEADERS_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_CUDA} ${GGML_SOURCES_CUDA}
) )
target_link_libraries(ggml-cuda PRIVATE ggml-base)
target_include_directories(ggml-cuda PRIVATE . ..)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
if (GGML_CUDA_GRAPHS) if (GGML_CUDA_GRAPHS)

@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
return ctx->devices[index]; return ctx->devices[index];
} }
static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() {
std::vector<ggml_backend_feature> features;
#define _STRINGIFY(...) #__VA_ARGS__
#define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
#ifdef __CUDA_ARCH_LIST__
features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
#endif
#ifdef GGML_CUDA_FORCE_MMQ
features.push_back({ "FORCE_MMQ", "1" });
#endif
#ifdef GGML_CUDA_FORCE_CUBLAS
features.push_back({ "FORCE_CUBLAS", "1" });
#endif
#ifdef GGML_CUDA_NO_VMM
features.push_back({ "NO_VMM", "1" });
#endif
#ifdef GGML_CUDA_NO_PEER_COPY
features.push_back({ "NO_PEER_COPY", "1" });
#endif
#ifdef GGML_CUDA_F16
features.push_back({ "F16", "1" });
#endif
#ifdef GGML_CUDA_USE_GRAPHS
features.push_back({ "USE_GRAPHS", "1" });
#endif
#ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
#endif
#ifdef GGML_CUDA_FA_ALL_QUANTS
features.push_back({ "FA_ALL_QUANTS", "1" });
#endif
#undef _STRINGIFY
#undef STRINGIFY
features.push_back({ nullptr, nullptr });
return features;
}();
return features.data();
GGML_UNUSED(reg);
}
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg); GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
return (void *)ggml_backend_cuda_unregister_host_buffer; return (void *)ggml_backend_cuda_unregister_host_buffer;
} }
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cuda_get_features;
}
return nullptr; return nullptr;
} }
@ -3169,7 +3227,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->description = prop.name; dev_ctx->description = prop.name;
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_cuda_device_interface, /* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
/* .context = */ dev_ctx /* .context = */ dev_ctx
}; };
@ -3177,7 +3235,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
} }
reg = ggml_backend_reg { reg = ggml_backend_reg {
/* .interface = */ ggml_backend_cuda_reg_interface, /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cuda_reg_interface,
/* .context = */ ctx /* .context = */ ctx
}; };
} }
@ -3209,3 +3268,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
return cuda_backend; return cuda_backend;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)

@ -64,12 +64,10 @@ else()
list(APPEND GGML_SOURCES_ROCM ${SRCS}) list(APPEND GGML_SOURCES_ROCM ${SRCS})
endif() endif()
add_library(ggml-hip ggml_add_backend_library(ggml-hip
${GGML_HEADERS_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_ROCM}) ${GGML_SOURCES_ROCM}
)
target_link_libraries(ggml-hip PRIVATE ggml-base)
target_include_directories(ggml-hip PRIVATE . ..)
# TODO: do not use CUDA definitions for HIP # TODO: do not use CUDA definitions for HIP
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)

@ -6,13 +6,13 @@ if (NOT glslc_executable)
message(FATAL_ERROR "glslc not found") message(FATAL_ERROR "glslc not found")
endif() endif()
add_library(ggml-kompute ggml_add_backend_library(ggml-kompute
ggml-kompute.cpp ggml-kompute.cpp
../../include/ggml-kompute.h ../../include/ggml-kompute.h
) )
target_link_libraries(ggml-kompute PRIVATE ggml-base kompute) target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)

@ -2176,9 +2176,12 @@ static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
ggml_backend_reg_t ggml_backend_kompute_reg() { ggml_backend_reg_t ggml_backend_kompute_reg() {
static ggml_backend_reg reg = { static ggml_backend_reg reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_kompute_reg_i, /* .iface = */ ggml_backend_kompute_reg_i,
/* .context = */ nullptr, /* .context = */ nullptr,
}; };
return &reg; return &reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)

@ -4,19 +4,16 @@ find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
message(STATUS "Metal framework found") message(STATUS "Metal framework found")
add_library(ggml-metal ggml_add_backend_library(ggml-metal
ggml-metal.m ggml-metal.m
) )
target_link_libraries(ggml-metal PRIVATE target_link_libraries(ggml-metal PRIVATE
ggml-base
${FOUNDATION_LIBRARY} ${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK} ${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK} ${METALKIT_FRAMEWORK}
) )
target_include_directories(ggml-metal PRIVATE . ..)
if (GGML_METAL_NDEBUG) if (GGML_METAL_NDEBUG)
add_compile_definitions(GGML_METAL_NDEBUG) add_compile_definitions(GGML_METAL_NDEBUG)
endif() endif()

@ -1927,7 +1927,7 @@ static void ggml_metal_encode_node(
// find the break-even point where the matrix-matrix kernel becomes more efficient compared // find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel // to the matrix-vector kernel
int ne11_mm_min = 1; int ne11_mm_min = 4;
#if 0 #if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models // the numbers below are measured on M2 Ultra for 7B and 13B models
@ -4372,17 +4372,43 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
GGML_UNUSED(index); GGML_UNUSED(index);
} }
static struct ggml_backend_feature g_ggml_backend_metal_features[] = {
#if defined(GGML_METAL_EMBED_LIBRARY)
{ "EMBED_LIBRARY", "1" },
#endif
#if defined(GGML_METAL_USE_BF16)
{ "BF16", "1" },
#endif
{ nil, nil },
};
static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) {
return g_ggml_backend_metal_features;
GGML_UNUSED(reg);
}
static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_metal_get_features;
}
return NULL;
GGML_UNUSED(reg);
}
static struct ggml_backend_reg_i ggml_backend_metal_reg_i = { static struct ggml_backend_reg_i ggml_backend_metal_reg_i = {
/* .get_name = */ ggml_backend_metal_reg_get_name, /* .get_name = */ ggml_backend_metal_reg_get_name,
/* .device_count = */ ggml_backend_metal_reg_device_count, /* .device_count = */ ggml_backend_metal_reg_device_count,
/* .device_get = */ ggml_backend_metal_reg_device_get, /* .device_get = */ ggml_backend_metal_reg_device_get,
/* .get_proc_address = */ NULL, /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
}; };
ggml_backend_reg_t ggml_backend_metal_reg(void) { ggml_backend_reg_t ggml_backend_metal_reg(void) {
// TODO: make this thread-safe somehow? // TODO: make this thread-safe somehow?
{ {
g_ggml_backend_metal_reg = (struct ggml_backend_reg) { g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_metal_reg_i, /* .iface = */ ggml_backend_metal_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -4396,3 +4422,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
return &g_ggml_backend_metal_reg; return &g_ggml_backend_metal_reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)

@ -5447,12 +5447,12 @@ kernel void kernel_mul_mm(
const int im = tgpig.z; const int im = tgpig.z;
// if this block is of 64x32 shape or smaller // if this block is of 64x32 shape or smaller
short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
// a thread shouldn't load data outside of the matrix // a thread shouldn't load data outside of the matrix
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
simdgroup_T8x8 ma[4]; simdgroup_T8x8 ma[4];
simdgroup_float8x8 mb[2]; simdgroup_float8x8 mb[2];
@ -5467,20 +5467,23 @@ kernel void kernel_mul_mm(
const int i12 = im%args.ne12; const int i12 = im%args.ne12;
const int i13 = im/args.ne12; const int i13 = im/args.ne12;
uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
short offset1 = il/nl; const short offset1 = il/nl;
device const block_q * x = (device const block_q *)(src0
+ args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*args.nb01 + offset0) + offset1;
device const float * y = (device const float *)(src1 device const float * y = (device const float *)(src1
+ args.nb13*i13 + args.nb13*i13
+ args.nb12*i12 + args.nb12*i12
+ args.nb11*(r1 * BLOCK_SIZE_N + thread_col) + args.nb11*(r1*BLOCK_SIZE_N + thread_col)
+ args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
// load data and store to threadgroup memory // load data and store to threadgroup memory
T4x4 temp_a; T4x4 temp_a;
dequantize_func(x, il, temp_a); dequantize_func(x, il, temp_a);
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16) #pragma unroll(16)
@ -5490,44 +5493,46 @@ kernel void kernel_mul_mm(
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
} }
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y); *(threadgroup float2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
il = (il + 2 < nl) ? il + 2 : il % 2; il = (il + 2 < nl) ? il + 2 : il % 2;
x = (il < 2) ? x + (2+nl-1)/nl : x; x = (il < 2) ? x + (2 + nl - 1)/nl : x;
y += BLOCK_SIZE_K; y += BLOCK_SIZE_K;
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
// load matrices from threadgroup memory and conduct outer products // load matrices from threadgroup memory and conduct outer products
threadgroup T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
#pragma unroll(4) #pragma unroll(4)
for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
#pragma unroll(4) #pragma unroll(4)
for (short i = 0; i < 4; i++) { for (short i = 0; i < 4; i++) {
simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
} }
simdgroup_barrier(mem_flags::mem_none); simdgroup_barrier(mem_flags::mem_none);
#pragma unroll(2) #pragma unroll(2)
for (short i = 0; i < 2; i++) { for (short i = 0; i < 2; i++) {
simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
} }
lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE;
lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE;
#pragma unroll(8) #pragma unroll(8)
for (short i = 0; i < 8; i++){ for (short i = 0; i < 8; i++){
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
} }
lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
} }
} }
if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) { if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) {
device float * C = (device float *) dst + device float * C = (device float *) dst +
(BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) + \ (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \
(BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
for (short i = 0; i < 8; i++) { for (short i = 0; i < 8; i++) {
simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0); simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
@ -5536,7 +5541,7 @@ kernel void kernel_mul_mm(
// block is smaller than 64x32, we should avoid writing data outside of the matrix // block is smaller than 64x32, we should avoid writing data outside of the matrix
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float * temp_str = ((threadgroup float *) shmem) \ threadgroup float * temp_str = ((threadgroup float *) shmem) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M; + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
for (short i = 0; i < 8; i++) { for (short i = 0; i < 8; i++) {
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
} }

@ -47,12 +47,10 @@ if (MUSAToolkit_FOUND)
set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22") set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
endforeach() endforeach()
add_library(ggml-musa ggml_add_backend_library(ggml-musa
${GGML_HEADERS_MUSA} ${GGML_HEADERS_MUSA}
${GGML_SOURCES_MUSA}) ${GGML_SOURCES_MUSA}
)
target_link_libraries(ggml-musa PRIVATE ggml-base)
target_include_directories(ggml-musa PRIVATE . ..)
# TODO: do not use CUDA definitions for MUSA # TODO: do not use CUDA definitions for MUSA
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)

@ -1,10 +1,8 @@
message(STATUS "Using RPC backend") message(STATUS "Using RPC backend")
add_library(ggml-rpc ggml_add_backend_library(ggml-rpc
ggml-rpc.cpp) ggml-rpc.cpp
)
target_link_libraries(ggml-rpc PRIVATE ggml-base)
target_include_directories(ggml-rpc PRIVATE . ..)
if (WIN32) if (WIN32)
target_link_libraries(ggml-rpc PRIVATE ws2_32) target_link_libraries(ggml-rpc PRIVATE ws2_32)

@ -1369,6 +1369,7 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
ggml_backend_reg_t ggml_backend_rpc_reg(void) { ggml_backend_reg_t ggml_backend_rpc_reg(void) {
static struct ggml_backend_reg ggml_backend_rpc_reg = { static struct ggml_backend_reg ggml_backend_rpc_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_rpc_reg_i, /* .iface = */ ggml_backend_rpc_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -1401,3 +1402,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
return dev; return dev;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)

@ -16,12 +16,10 @@ endif()
message(STATUS "SYCL found") message(STATUS "SYCL found")
#todo: AOT #todo: AOT
add_library(ggml-sycl ggml_add_backend_library(ggml-sycl
ggml-sycl.cpp ggml-sycl.cpp
../../include/ggml-sycl.h) ../../include/ggml-sycl.h
)
target_link_libraries(ggml-sycl PRIVATE ggml-base)
target_include_directories(ggml-sycl PRIVATE . ..)
if (GGML_SYCL_F16) if (GGML_SYCL_F16)
if (GGML_SYCL_TARGET STREQUAL "AMD") if (GGML_SYCL_TARGET STREQUAL "AMD")

View file

@ -4637,7 +4637,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
dev_ctx->description = prop.get_name(); dev_ctx->description = prop.get_name();
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_sycl_device_interface, /* .iface = */ ggml_backend_sycl_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
/* .context = */ dev_ctx /* .context = */ dev_ctx
}; };
@ -4645,7 +4645,8 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
} }
reg = ggml_backend_reg { reg = ggml_backend_reg {
/* .interface = */ ggml_backend_sycl_reg_interface, /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_sycl_reg_interface,
/* .context = */ ctx /* .context = */ ctx
}; };
} }
@ -4678,3 +4679,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
return sycl_backend; return sycl_backend;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)

@ -3,13 +3,13 @@ find_package(Vulkan COMPONENTS glslc REQUIRED)
if (Vulkan_FOUND) if (Vulkan_FOUND)
message(STATUS "Vulkan found") message(STATUS "Vulkan found")
add_library(ggml-vulkan ggml_add_backend_library(ggml-vulkan
ggml-vulkan.cpp ggml-vulkan.cpp
../../include/ggml-vulkan.h ../../include/ggml-vulkan.h
) )
target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan) target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector

@ -6738,6 +6738,7 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
ggml_backend_reg_t ggml_backend_vk_reg() { ggml_backend_reg_t ggml_backend_vk_reg() {
static ggml_backend_reg reg = { static ggml_backend_reg reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_vk_reg_i, /* .iface = */ ggml_backend_vk_reg_i,
/* .context = */ nullptr, /* .context = */ nullptr,
}; };
@ -7365,3 +7366,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
} }
#endif #endif
GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)

@ -474,9 +474,15 @@ void write_output_files() {
int main(int argc, char** argv) { int main(int argc, char** argv) {
std::map<std::string, std::string> args; std::map<std::string, std::string> args;
for (int i = 1; i < argc; i += 2) { for (int i = 1; i < argc; ++i) {
if (i + 1 < argc) { std::string arg = argv[i];
args[argv[i]] = argv[i + 1]; if (arg.rfind("--", 0) == 0) {
if (i + 1 < argc && argv[i + 1][0] != '-') {
args[arg] = argv[i + 1];
++i;
} else {
args[arg] = "";
}
} }
} }

@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default; g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
g_logger_state.log_callback_user_data = user_data; g_logger_state.log_callback_user_data = user_data;
} }
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
p->n_threads = n_threads;
p->prio = 0; // default priority (usually means normal or inherited)
p->poll = 50; // hybrid-polling enabled
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
p->paused = false; // threads are ready to go
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
}
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
struct ggml_threadpool_params p;
ggml_threadpool_params_init(&p, n_threads);
return p;
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}

@ -243,7 +243,7 @@ class MODEL_ARCH(IntEnum):
COMMAND_R = auto() COMMAND_R = auto()
DBRX = auto() DBRX = auto()
OLMO = auto() OLMO = auto()
OLMO_1124 = auto() OLMO2 = auto()
OLMOE = auto() OLMOE = auto()
OPENELM = auto() OPENELM = auto()
ARCTIC = auto() ARCTIC = auto()
@ -405,7 +405,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo", MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO_1124: "olmo_1124", MODEL_ARCH.OLMO2: "olmo2",
MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.ARCTIC: "arctic",
@ -1071,7 +1071,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.OLMO_1124: [ MODEL_ARCH.OLMO2: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT,

@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon "transformer.word_embeddings", # falcon
"word_embeddings", # bloom "word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron olmoe olmo_1124 "model.embed_tokens", # llama-hf nemotron olmoe olmo2
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
@ -54,7 +54,7 @@ class TensorNameMap:
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124 "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
@ -66,7 +66,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox "gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
"model.norm", # llama-hf baichuan internlm2 olmoe olmo_1124 "model.norm", # llama-hf baichuan internlm2 olmoe olmo2
"norm", # llama-pth "norm", # llama-pth
"transformer.norm_f", # mpt dbrx "transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
@ -145,7 +145,7 @@ class TensorNameMap:
# Attention query # Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wq", # llama-pth "layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert "encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
@ -157,7 +157,7 @@ class TensorNameMap:
# Attention key # Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wk", # llama-pth "layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert "encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j "transformer.h.{bid}.attn.k_proj", # gpt-j
@ -170,7 +170,7 @@ class TensorNameMap:
# Attention value # Attention value
MODEL_TENSOR.ATTN_V: ( MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wv", # llama-pth "layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert "encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j "transformer.h.{bid}.attn.v_proj", # gpt-j
@ -188,7 +188,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom "h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j "transformer.h.{bid}.attn.out_proj", # gpt-j
@ -215,7 +215,7 @@ class TensorNameMap:
), ),
MODEL_TENSOR.ATTN_POST_NORM: ( MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo_1124 "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
), ),
# Rotary embeddings # Rotary embeddings
@ -250,7 +250,7 @@ class TensorNameMap:
# Post feed-forward norm # Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: ( MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124 "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
), ),
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
@ -273,7 +273,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom "h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo_1124 "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
"layers.{bid}.feed_forward.w3", # llama-pth "layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert "encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j "transformer.h.{bid}.mlp.fc_in", # gpt-j
@ -314,7 +314,7 @@ class TensorNameMap:
# Feed-forward gate # Feed-forward gate
MODEL_TENSOR.FFN_GATE: ( MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo_1124 "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
"layers.{bid}.feed_forward.w1", # llama-pth "layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", # qwen "transformer.h.{bid}.mlp.w2", # qwen
"transformer.h.{bid}.mlp.c_fc2", # jais "transformer.h.{bid}.mlp.c_fc2", # jais
@ -346,7 +346,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom "h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo_1124 "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
"layers.{bid}.feed_forward.w2", # llama-pth "layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert "encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j "transformer.h.{bid}.mlp.fc_out", # gpt-j
@ -383,7 +383,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon "model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo_1124 "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.q_ln", # sea-lion "transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm "transformer.layers.{bid}.attn.q_norm", # openelm
@ -392,7 +392,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_K_NORM: ( MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon "model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo_1124 "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.k_ln", # sea-lion "transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm "transformer.layers.{bid}.attn.k_norm", # openelm

include/llama-cpp.h (new file, 25 lines)
@ -0,0 +1,25 @@
#pragma once
#ifndef __cplusplus
#error "This header is for C++ only"
#endif
#include <memory>
#include "llama.h"
struct llama_model_deleter {
void operator()(llama_model * model) { llama_free_model(model); }
};
struct llama_context_deleter {
void operator()(llama_context * context) { llama_free(context); }
};
struct llama_sampler_deleter {
void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
};
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;

@ -272,6 +272,9 @@ extern "C" {
}; };
struct llama_model_params { struct llama_model_params {
// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
ggml_backend_dev_t * devices;
int32_t n_gpu_layers; // number of layers to store in VRAM int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs enum llama_split_mode split_mode; // how to split the model across multiple GPUs
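The new devices field takes a NULL-terminated list; a hypothetical sketch of selecting devices explicitly (file path and device choice are illustrative only):

// Hypothetical: pass an explicit, NULL-terminated device list instead of using all available devices.
#include "llama.h"
#include "ggml-backend.h"

llama_model * load_on_cpu_only(const char * path /* e.g. "model.gguf" */) {
    ggml_backend_dev_t devices[] = {
        ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU),  // any device handle works here
        nullptr,                                                 // terminator required
    };
    llama_model_params mparams = llama_model_default_params();
    mparams.devices = devices;
    return llama_load_model_from_file(path, mparams);
}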

@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN) if (EMSCRIPTEN)
else() else()
if (NOT GGML_BACKEND_DL)
add_subdirectory(vdot) add_subdirectory(vdot)
endif()
endif() endif()

@ -179,7 +179,7 @@ enum llm_arch {
LLM_ARCH_COMMAND_R, LLM_ARCH_COMMAND_R,
LLM_ARCH_DBRX, LLM_ARCH_DBRX,
LLM_ARCH_OLMO, LLM_ARCH_OLMO,
LLM_ARCH_OLMO_1124, LLM_ARCH_OLMO2,
LLM_ARCH_OLMOE, LLM_ARCH_OLMOE,
LLM_ARCH_OPENELM, LLM_ARCH_OPENELM,
LLM_ARCH_ARCTIC, LLM_ARCH_ARCTIC,
@ -233,7 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" }, { LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_OLMO_1124, "olmo_1124" }, { LLM_ARCH_OLMO2, "olmo2" },
{ LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OLMOE, "olmoe" },
{ LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_ARCTIC, "arctic" },
@ -1210,7 +1210,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
}, },
}, },
{ {
LLM_ARCH_OLMO_1124, LLM_ARCH_OLMO2,
{ {
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
@ -4866,7 +4866,9 @@ struct llama_model_loader {
mappings.reserve(files.size()); mappings.reserve(files.size());
mmaps_used.reserve(files.size()); mmaps_used.reserve(files.size());
for (const auto & file : files) { for (const auto & file : files) {
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa())); auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
mmaps_used.emplace_back(mapping->size, 0); mmaps_used.emplace_back(mapping->size, 0);
if (mlock_mmaps) { if (mlock_mmaps) {
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock()); std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@ -5898,7 +5900,7 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMO2:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -8591,7 +8593,7 @@ static bool llm_load_tensors(
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
} }
} break; } break;
case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMO2:
{ {
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -9190,7 +9192,7 @@ static bool llm_load_tensors(
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (!dev) { if (!dev) {
// FIXME: workaround for CPU backend buft having a NULL device // FIXME: workaround for CPU backend buft having a NULL device
dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0); dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
} }
ggml_backend_dev_props props; ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props); ggml_backend_dev_get_props(dev, &props);
@ -14481,7 +14483,7 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_olmo_1124() { struct ggml_cgraph * build_olmo2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens // mutable variable, needed during the last layer of the computation to skip unused tokens
@ -16797,9 +16799,9 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_olmo(); result = llm.build_olmo();
} break; } break;
case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMO2:
{ {
result = llm.build_olmo_1124(); result = llm.build_olmo2();
} break; } break;
case LLM_ARCH_OLMOE: case LLM_ARCH_OLMOE:
{ {
@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute(
int n_threads, int n_threads,
ggml_threadpool * threadpool) { ggml_threadpool * threadpool) {
if (lctx.backend_cpu != nullptr) { if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
set_threadpool_fn(lctx.backend_cpu, threadpool);
} }
// set the number of threads for all the backends // set the number of threads for all the backends
@ -19361,6 +19364,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
// //
struct llama_model_params llama_model_default_params() { struct llama_model_params llama_model_default_params() {
struct llama_model_params result = { struct llama_model_params result = {
/*.devices =*/ nullptr,
/*.n_gpu_layers =*/ 0, /*.n_gpu_layers =*/ 0,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0, /*.main_gpu =*/ 0,
@ -19478,7 +19482,11 @@ void llama_backend_init(void) {
void llama_numa_init(enum ggml_numa_strategy numa) { void llama_numa_init(enum ggml_numa_strategy numa) {
if (numa != GGML_NUMA_STRATEGY_DISABLED) { if (numa != GGML_NUMA_STRATEGY_DISABLED) {
ggml_numa_init(numa); auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
GGML_ASSERT(dev && "CPU backend is not loaded");
auto * reg = ggml_backend_dev_backend_reg(dev);
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
numa_init_fn(numa);
} }
} }
@ -19569,8 +19577,12 @@ struct llama_model * llama_load_model_from_file(
} }
// create list of devices to use with this model // create list of devices to use with this model
// currently, we use all available devices if (params.devices) {
// TODO: rework API to give user more control over device selection for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
model->devices.push_back(*dev);
}
} else {
// use all available devices
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i); ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) { switch (ggml_backend_dev_type(dev)) {
@ -19584,6 +19596,7 @@ struct llama_model * llama_load_model_from_file(
break; break;
} }
} }
}
// if using single GPU mode, remove all except the main GPU // if using single GPU mode, remove all except the main GPU
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
@ -19752,9 +19765,6 @@ struct llama_context * llama_new_context_with_model(
__func__, n_ctx_per_seq, hparams.n_ctx_train); __func__, n_ctx_per_seq, hparams.n_ctx_train);
} }
ctx->abort_callback = params.abort_callback;
ctx->abort_callback_data = params.abort_callback_data;
ctx->logits_all = params.logits_all; ctx->logits_all = params.logits_all;
// build worst-case graph for encoder if a model contains encoder // build worst-case graph for encoder if a model contains encoder
@ -19803,7 +19813,7 @@ struct llama_context * llama_new_context_with_model(
} }
// add CPU backend // add CPU backend
ctx->backend_cpu = ggml_backend_cpu_init(); ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (ctx->backend_cpu == nullptr) { if (ctx->backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
llama_free(ctx); llama_free(ctx);
@ -19823,6 +19833,8 @@ struct llama_context * llama_new_context_with_model(
} }
} }
llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) { if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx); llama_free(ctx);
@ -19868,7 +19880,8 @@ struct llama_context * llama_new_context_with_model(
std::vector<ggml_backend_t> backend_ptrs; std::vector<ggml_backend_t> backend_ptrs;
for (auto & backend : ctx->backends) { for (auto & backend : ctx->backends) {
auto * buft = ggml_backend_get_default_buffer_type(backend.get()); auto * buft = ggml_backend_get_default_buffer_type(backend.get());
if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) { auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
// use the host buffer of the first device CPU for faster transfer of the intermediate state // use the host buffer of the first device CPU for faster transfer of the intermediate state
auto * dev = model->devices[0]; auto * dev = model->devices[0];
auto * host_buft = ggml_backend_dev_host_buffer_type(dev); auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@ -19896,7 +19909,8 @@ struct llama_context * llama_new_context_with_model(
// pipeline parallelism requires support for async compute and events in all devices // pipeline parallelism requires support for async compute and events in all devices
if (pipeline_parallel) { if (pipeline_parallel) {
for (auto & backend : ctx->backends) { for (auto & backend : ctx->backends) {
if (ggml_backend_is_cpu(backend.get())) { auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
// ignore CPU backend // ignore CPU backend
continue; continue;
} }
@ -20070,7 +20084,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_QWEN: case LLM_ARCH_QWEN:
case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN2MOE:
case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMO2:
case LLM_ARCH_OLMOE: case LLM_ARCH_OLMOE:
case LLM_ARCH_PHI2: case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3: case LLM_ARCH_PHI3:
@ -21450,6 +21464,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
ctx->abort_callback = abort_callback; ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data; ctx->abort_callback_data = abort_callback_data;
for (auto & backend : ctx->backends) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
if (set_abort_callback_fn) {
set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
}
}
} }
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
@@ -22191,32 +22213,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
 }
 
 const char * llama_print_system_info(void) {
-    ggml_cpu_init(); // some ARM features are detected at runtime
-
     static std::string s;
 
-    s = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
-    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
-    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
-    s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
-    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-    s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }
 
     return s.c_str();
 }
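
A short usage sketch (not part of the diff): after this rewrite the string is assembled from each registered backend's feature list rather than a fixed set of ggml_cpu_has_* flags, so printing it is enough to see which backends and features are active. The output shown in the comment is indicative only and depends on the build.

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    // Typically prints one segment per registered backend, e.g.
    // "CPU : AVX2 = 1 | FMA = 1 | ... | " -- exact content depends on the build.
    printf("%s\n", llama_print_system_info());
    return 0;
}
```
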


@@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
 # llama_target_and_test(test-double-float.cpp) # SLOW
 llama_target_and_test(test-log.cpp)
 llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
 llama_target_and_test(test-sampling.cpp)
 llama_target_and_test(test-chat-template.cpp)
 llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-grammar-integration.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
-llama_target_and_test(test-barrier.cpp)
 # llama_target_and_test(test-opt.cpp) # SLOW
 llama_target_and_test(test-backend-ops.cpp)
-llama_target_and_test(test-rope.cpp)
 llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_target_and_test(test-autorelease.cpp LABEL "model")
 
+if (NOT GGML_BACKEND_DL)
+    # these tests use the backends directly and cannot be built with dynamic loading
+    llama_target_and_test(test-barrier.cpp)
+    llama_target_and_test(test-quantize-fns.cpp)
+    llama_target_and_test(test-quantize-perf.cpp)
+    llama_target_and_test(test-rope.cpp)
+endif()
+
 # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)


@@ -16,7 +16,6 @@
 #include <ggml.h>
-#include <ggml-cpu.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
@@ -26,7 +25,6 @@
 #include <cstdint>
 #include <cstring>
 #include <cinttypes>
-#include <functional>
 #include <memory>
 #include <random>
 #include <stdio.h>
@@ -639,19 +637,20 @@ struct test_case {
         // determine number of runs
         int n_runs;
+        bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
         if (op_flops(out) > 0) {
             // based on flops
             const uint64_t GFLOP = 1000 * 1000 * 1000;
             const uint64_t target_flops_cpu = 8ULL * GFLOP;
             const uint64_t target_flops_gpu = 100ULL * GFLOP;
-            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
         } else {
             // based on memory size
             const size_t GB = 1ULL << 30;
             const size_t target_size_cpu = 8 * GB;
             const size_t target_size_gpu = 32 * GB;
-            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
         }
@@ -3873,7 +3872,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     if (mode == MODE_TEST) {
         auto test_cases = make_test_cases_eval();
-        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        if (backend_cpu == NULL) {
+            printf("  Failed to initialize CPU backend\n");
+            return false;
+        }
 
         size_t n_ok = 0;
         for (auto & test : test_cases) {
@@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) {
         }
     }
 
-    // enumerate backends
+    // load and enumerate backends
+    ggml_backend_load_all();
+
     printf("Testing %zu devices\n\n", ggml_backend_dev_count());
 
     size_t n_ok = 0;
@@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) {
             continue;
         }
 
-        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
-        GGML_ASSERT(backend != NULL);
-
-        if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
+        if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
             printf("  Skipping CPU backend\n");
-            ggml_backend_free(backend);
             n_ok++;
             continue;
         }
 
+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
+        GGML_ASSERT(backend != NULL);
+
         ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
         auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
         if (ggml_backend_set_n_threads_fn) {
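
To make the new flow in this test easier to follow, here is a stand-alone sketch of the same pattern: load backends dynamically, walk the device registry, and resolve optional entry points by name. It is an illustration, not part of the change, and beyond the calls visible in the diff it assumes only the basic device accessors from ggml-backend.h (ggml_backend_dev_get, ggml_backend_dev_name).

```cpp
#include <cstdio>
#include "ggml-backend.h"

int main() {
    // Load whatever backend libraries are available at runtime (CPU, CUDA, ...).
    ggml_backend_load_all();

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s\n", i, ggml_backend_dev_name(dev));

        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
        if (backend == NULL) {
            continue;
        }

        // Optional, backend-specific entry points are resolved by name.
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto set_n_threads_fn = (ggml_backend_set_n_threads_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (set_n_threads_fn) {
            set_n_threads_fn(backend, 4); // e.g. limit the CPU backend to 4 threads
        }

        ggml_backend_free(backend);
    }
    return 0;
}
```
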


@@ -79,9 +79,9 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 }
 
 // Total dot product error
-static float dot_product_error(
-    const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
-) {
+static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
+    GGML_UNUSED(qfns);
+
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);