Merge upstream changes, implement basic vulkan backend

This commit is contained in:
0cc4m 2024-01-18 16:54:12 +01:00
commit 2d14b22a99
114 changed files with 15366 additions and 5819 deletions

View file

@ -515,6 +515,31 @@ jobs:
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
android-build:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v3
- name: Set up JDK
uses: actions/setup-java@v3
with:
java-version: 17
distribution: zulu
- name: Setup Android SDK
uses: android-actions/setup-android@v3
with:
log-accepted-android-sdk-licenses: false
- name: Build
run: |
cd examples/llama.android
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
./gradlew build --no-daemon -Pskip-armeabi-v7a
# freeBSD-latest:
# runs-on: macos-12
# steps:

55
.github/workflows/nix-ci-aarch64.yml vendored Normal file
View file

@ -0,0 +1,55 @@
name: Nix aarch64 builds
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
jobs:
nix-build-aarch64:
if: ${{ vars.CACHIX_NAME != '' }}
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install QEMU
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
run: |
sudo apt-get update
sudo apt-get install -y qemu-user-static qemu-system-aarch64
sudo usermod -a -G kvm $USER
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.aarch64-linux"
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--systems aarch64-linux
--flake
".#checks.aarch64-linux"

View file

@ -69,44 +69,3 @@ jobs:
-- --skip-cached --no-nom
--flake
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build-aarch64:
if: ${{ vars.CACHIX_NAME != '' }}
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install QEMU
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
run: |
sudo apt-get install -y qemu-user-static qemu-system-aarch64
sudo usermod -a -G kvm $USER
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.aarch64-linux"
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--systems aarch64-linux
--flake
".#checks.aarch64-linux"

View file

@ -19,4 +19,4 @@ jobs:
pr-labels: |
nix
pr-reviewers: philiptaron,SomeoneSerge
token: ${{ secrets.GITHUB_TOKEN }}
token: ${{ secrets.FLAKE_TOKEN }}

1
.gitignore vendored
View file

@ -43,6 +43,7 @@ models-mnt
/embedding
/gguf
/gguf-llama-simple
/imatrix
/infill
/libllama.so
/llama-bench

View file

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.13) # for add_link_options
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -76,6 +76,10 @@ if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
endif()
if (WIN32)
option(LLAMA_WIN_VER "llama: Windows Version" 0x602)
endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
@ -607,6 +611,13 @@ if (NOT MSVC)
endif()
endif()
function(add_compile_option_cpp ARG)
# Adds a compile option to C/C++ only, but not for Cuda.
# Use, e.g., for CPU-architecture flags.
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
endfunction()
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
message(STATUS "ARM detected")
if (MSVC)
@ -641,8 +652,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
include(cmake/FindSIMD.cmake)
endif ()
if (LLAMA_AVX512)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
add_compile_option_cpp(/arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
@ -656,37 +666,35 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif (LLAMA_AVX2)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
add_compile_option_cpp(/arch:AVX2)
elseif (LLAMA_AVX)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
add_compile_option_cpp(/arch:AVX)
endif()
else()
if (LLAMA_NATIVE)
add_compile_options(-march=native)
add_compile_option_cpp(-march=native)
endif()
if (LLAMA_F16C)
add_compile_options(-mf16c)
add_compile_option_cpp(-mf16c)
endif()
if (LLAMA_FMA)
add_compile_options(-mfma)
add_compile_option_cpp(-mfma)
endif()
if (LLAMA_AVX)
add_compile_options(-mavx)
add_compile_option_cpp(-mavx)
endif()
if (LLAMA_AVX2)
add_compile_options(-mavx2)
add_compile_option_cpp(-mavx2)
endif()
if (LLAMA_AVX512)
add_compile_options(-mavx512f)
add_compile_options(-mavx512bw)
add_compile_option_cpp(-mavx512f)
add_compile_option_cpp(-mavx512bw)
endif()
if (LLAMA_AVX512_VBMI)
add_compile_options(-mavx512vbmi)
add_compile_option_cpp(-mavx512vbmi)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_options(-mavx512vnni)
add_compile_option_cpp(-mavx512vnni)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
@ -703,7 +711,7 @@ endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=0x602)
add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
endif()
#

View file

@ -1,6 +1,6 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
@ -43,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
endif
endif
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif
default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
@ -629,6 +625,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@ -683,11 +682,6 @@ lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ifdef LLAMA_METAL
metal: examples/metal/metal.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif
ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
(cd examples/batched.swift; make build)

View file

@ -14,14 +14,14 @@ let package = Package(
.library(name: "llama", targets: ["llama"]),
],
dependencies: [
.package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
.package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
],
targets: [
.target(
name: "llama",
dependencies: ["ggml"],
path: ".",
exclude: [],
exclude: ["ggml-metal.metal"],
sources: [
"llama.cpp",
],

View file

@ -10,6 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
- Collecting Apple Silicon performance stats:
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
@ -136,6 +137,7 @@ as the main playground for developing new features for the [ggml](https://github
- [semperai/amica](https://github.com/semperai/amica)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [iohub/collama](https://github.com/iohub/coLLaMA)
---

View file

@ -43,7 +43,7 @@ Example for llama model
# For llama7b and llama2 models
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
# For mistral and mpt models
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
```
## Quantize

View file

@ -167,6 +167,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
if (params.n_threads_batch <= 0) {
params.n_threads_batch = std::thread::hardware_concurrency();
}
} else if (arg == "-td" || arg == "--threads-draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads_draft = std::stoi(argv[i]);
if (params.n_threads_draft <= 0) {
params.n_threads_draft = std::thread::hardware_concurrency();
}
} else if (arg == "-tbd" || arg == "--threads-batch-draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads_batch_draft = std::stoi(argv[i]);
if (params.n_threads_batch_draft <= 0) {
params.n_threads_batch_draft = std::thread::hardware_concurrency();
}
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_param = true;
@ -543,9 +561,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers = std::stoi(argv[i]);
#else
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
@ -554,9 +571,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers_draft = std::stoi(argv[i]);
#else
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
@ -565,25 +581,44 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
#endif
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string arg_next = argv[i];
if (arg_next == "none") {
params.split_mode = LLAMA_SPLIT_NONE;
} else if (arg_next == "layer") {
params.split_mode = LLAMA_SPLIT_LAYER;
} else if (arg_next == "row") {
params.split_mode = LLAMA_SPLIT_ROW;
} else {
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
std::string arg_next = argv[i];
// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
if (split_arg.size() >= LLAMA_MAX_DEVICES) {
invalid_param = true;
break;
}
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
@ -591,14 +626,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f;
}
}
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = false;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
@ -606,6 +635,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.numa = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "--no-display-prompt") {
params.display_prompt = false;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
@ -630,6 +661,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.ppl_stride = std::stoi(argv[i]);
} else if (arg == "-ptc" || arg == "--print-token-count") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_print = std::stoi(argv[i]);
} else if (arg == "--ppl-output-type") {
if (++i >= argc) {
invalid_param = true;
@ -812,7 +849,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" --version show version and build info\n");
printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@ -826,6 +863,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N\n");
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" -td N, --threads-draft N");
printf(" number of threads to use during generation (default: same as --threads)");
printf(" -tbd N, --threads-batch-draft N\n");
printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
printf(" -p PROMPT, --prompt PROMPT\n");
printf(" prompt to start generation with (default: empty)\n");
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
@ -909,20 +950,22 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" number of layers to store in VRAM\n");
printf(" -ngld N, --n-gpu-layers-draft N\n");
printf(" number of layers to store in VRAM for the draft model\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
#ifdef GGML_USE_CUBLAS
printf(" -nommq, --no-mul-mat-q\n");
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
printf(" how to split the model across multiple GPUs, one of:\n");
printf(" - none: use one GPU only\n");
printf(" - layer (default): split layers and KV across GPUs\n");
printf(" - row: split rows across GPUs\n");
printf(" -ts SPLIT, --tensor-split SPLIT\n");
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
#endif
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
printf(" -gat N, --grp-attn-w N\n");
printf(" -gaw N, --grp-attn-w N\n");
printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
printf(" --verbose-prompt print prompt before generation\n");
printf(" -dkvc, --dump-kv-cache\n");
printf(" verbose print of the KV cache\n");
printf(" -nkvo, --no-kv-offload\n");
@ -944,6 +987,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -ptc N, --print-token-count N\n");
printf(" print token count every N tokens (default: %d)\n", params.n_print);
printf("\n");
#ifndef LOG_DISABLE_LOGS
log_print_usage();
@ -1033,6 +1078,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
@ -1047,6 +1093,9 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
}
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f32") {
return GGML_TYPE_F32;
}
if (s == "f16") {
return GGML_TYPE_F16;
}
@ -1558,6 +1607,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
}
//

View file

@ -46,7 +46,9 @@ struct gpt_params {
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
@ -59,11 +61,13 @@ struct gpt_params {
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
@ -124,6 +128,7 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
bool numa = false; // attempt optimizations that help on some NUMA systems
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
@ -242,4 +247,3 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

View file

@ -190,6 +190,11 @@ static llama_token llama_sampling_sample_impl(
logits[it->first] += it->second;
}
if (ctx_cfg) {
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@ -198,10 +203,6 @@ static llama_token llama_sampling_sample_impl(
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
if (ctx_cfg) {
llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
}
// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);

View file

@ -23,6 +23,15 @@ if 'NO_LOCAL_GGUF' not in os.environ:
import gguf
# check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys):
for k in keys:
if k in d:
return d[k]
print(f"Could not find any of {keys}")
sys.exit()
###### MODEL DEFINITIONS ######
class SentencePieceTokenTypes(IntEnum):
@ -817,10 +826,17 @@ class PersimmonModel(Model):
hidden_size = self.hparams["hidden_size"]
self.gguf_writer.add_name('persimmon-8b-chat')
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
# NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
# than the head size?
# ref: https://github.com/ggerganov/llama.cpp/pull/4889
# self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
@ -1061,17 +1077,22 @@ class GPT2Model(Model):
class Phi2Model(Model):
def set_gguf_parameters(self):
block_count = self.hparams["n_layer"]
block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
self.gguf_writer.add_name("Phi2")
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
self.gguf_writer.add_embedding_length(n_embd)
self.gguf_writer.add_feed_forward_length(4 * n_embd)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head)
self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_add_bos_token(False)

File diff suppressed because it is too large Load diff

View file

@ -36,9 +36,7 @@ else()
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(train-text-from-scratch)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
add_subdirectory(imatrix)
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()

View file

@ -88,7 +88,10 @@ int main(int argc, char ** argv) {
llama_model_params model_params = llama_model_default_params();
const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
model_params.n_gpu_layers = n_gpu_layers;
model_params.tensor_split = t_split.data();
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

View file

@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
// Set up a the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
// Set up a the compute graph
// printf("Creating new tensor q31\n");
@ -207,7 +207,7 @@ int main(int argc, char ** argv) {
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
// printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

View file

@ -245,9 +245,8 @@ static struct lora_data * load_lora(struct lora_info * info) {
params_ggml.no_alloc = true;
result->ctx = ggml_init(params_ggml);
uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
uint32_t magic = file.read_u32();
if (magic != LLAMA_FILE_MAGIC_LORA) {
if (magic != LLAMA_FILE_MAGIC_GGLA) {
die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
}
uint32_t version = file.read_u32();

View file

@ -1138,9 +1138,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
return tn_buf.data();
};
uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
// write_magic
file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic
file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic
file.write_u32(1); // version
// write_hparams
file.write_u32(lora->hparams.lora_r);
@ -1800,7 +1799,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> train_tokens;
std::vector<size_t> train_samples_begin;
std::vector<size_t> train_samples_size;
printf("%s: tokenize training data\n", __func__);
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
tokenize_file(lctx,
params.common.fn_train_data,
params.common.sample_start,

View file

@ -0,0 +1,5 @@
set(TARGET imatrix)
add_executable(${TARGET} imatrix.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,380 @@
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <sstream>
#include <thread>
#include <mutex>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <algorithm>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
struct Stats {
std::vector<float> values;
int ncall = 0;
};
struct StatParams {
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
bool collect_output_weight = false;
};
class IMatrixCollector {
public:
IMatrixCollector() = default;
void set_parameters(StatParams&& params) { m_params = std::move(params); }
void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
void save_imatrix() const;
private:
std::unordered_map<std::string, Stats> m_stats;
StatParams m_params;
std::mutex m_mutex;
int m_last_call = 0;
};
void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
std::lock_guard<std::mutex> lock(m_mutex);
auto& e = m_stats[src0->name];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
}
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = (const float *)src1->data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix();
}
}
}
void IMatrixCollector::save_imatrix() const {
const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char*)&n_entries, sizeof(n_entries));
for (auto& p : m_stats) {
int len = p.first.size();
out.write((const char*)&len, sizeof(len));
out.write(p.first.c_str(), len);
out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
out.write((const char*)&nval, sizeof(nval));
if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
}
if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
}
}
static IMatrixCollector g_collector;
static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
g_collector.collect_imatrix(src0, src1);
}
struct results_log_softmax {
double log_softmax;
float logit;
float prob;
};
static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) {
max_logit = std::max(max_logit, v);
}
double sum_exp = 0.0;
for (size_t i = 0; i < logits.size(); i++) {
// Subtract the maximum logit value from the current logit value for numerical stability
const float logit = logits[i] - max_logit;
const float exp_logit = expf(logit);
sum_exp += exp_logit;
probs[i] = exp_logit;
}
for (size_t i = 0; i < probs.size(); i++) {
probs[i] /= sum_exp;
}
return probs;
}
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) {
max_logit = std::max(max_logit, logits[i]);
}
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}
static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history
) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
double local_nll = 0;
double local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
const double v = -results.log_softmax;
local_nll += v;
local_nll2 += v*v;
logit_history[i] = results.logit;
prob_history[i] = results.prob;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
}
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return false;
}
std::vector<float> logit_history;
logit_history.resize(tokens.size());
std::vector<float> prob_history;
prob_history.resize(tokens.size());
const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_batch = params.n_batch;
int count = 0;
double nll = 0.0;
double nll2 = 0.0;
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
const int first = n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
}
printf("\n");
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
}
return true;
}
int main(int argc, char ** argv) {
StatParams sparams;
std::vector<char*> args;
args.push_back(argv[0]);
int iarg = 1;
for (; iarg < argc-1; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-o" || arg == "--output-file") {
sparams.ofile = argv[++iarg];
}
else if (arg == "-ofreq" || arg == "--output-frequency") {
sparams.n_output_frequency = std::stoi(argv[++iarg]);
}
else if (arg == "-ow" || arg == "--output-weight") {
sparams.collect_output_weight = std::stoi(argv[++iarg]);
}
else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]);
} else {
args.push_back(argv[iarg]);
}
}
if (iarg < argc) {
args.push_back(argv[iarg]);
}
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}
g_collector.set_parameters(std::move(sparams));
ggml_set_imatrix_collection(ik_collect_imatrix);
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
bool OK = compute_imatrix(ctx, params);
if (!OK) {
return 1;
}
g_collector.save_imatrix();
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View file

@ -128,6 +128,25 @@ static std::string get_gpu_info() {
// command line params
enum output_formats {CSV, JSON, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) {
switch (format) {
case CSV: return "csv";
case JSON: return "json";
case MARKDOWN: return "md";
case SQL: return "sql";
default: GGML_ASSERT(!"invalid output format");
}
}
static const char * split_mode_str(llama_split_mode mode) {
switch (mode) {
case LLAMA_SPLIT_NONE: return "none";
case LLAMA_SPLIT_LAYER: return "layer";
case LLAMA_SPLIT_ROW: return "row";
default: GGML_ASSERT(!"invalid split mode");
}
}
struct cmd_params {
std::vector<std::string> model;
std::vector<int> n_prompt;
@ -137,6 +156,7 @@ struct cmd_params {
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> mul_mat_q;
@ -155,6 +175,7 @@ static const cmd_params cmd_params_defaults = {
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* mul_mat_q */ {true},
@ -169,21 +190,22 @@ static void print_usage(int /* argc */, char ** argv) {
printf("\n");
printf("options:\n");
printf(" -h, --help\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> \n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
@ -306,6 +328,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
std::vector<llama_split_mode> modes;
for (const auto & m : p) {
llama_split_mode mode;
if (m == "none") {
mode = LLAMA_SPLIT_NONE;
} else if (m == "layer") {
mode = LLAMA_SPLIT_LAYER;
} else if (m == "row") {
mode = LLAMA_SPLIT_ROW;
} else {
invalid_param = true;
break;
}
modes.push_back(mode);
}
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
} else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) {
invalid_param = true;
@ -392,6 +436,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@ -410,6 +455,7 @@ struct cmd_params_instance {
ggml_type type_v;
int n_threads;
int n_gpu_layers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
@ -419,6 +465,7 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
@ -428,6 +475,7 @@ struct cmd_params_instance {
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model &&
n_gpu_layers == other.n_gpu_layers &&
split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
tensor_split == other.tensor_split;
}
@ -446,45 +494,13 @@ struct cmd_params_instance {
}
};
static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
std::vector<cmd_params_instance> instances;
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & nb : params.n_batch)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & mmq : params.mul_mat_q)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & nt : params.n_threads) {
cmd_params_instance instance = {
/* .model = */ m,
/* .n_prompt = */ n_prompt,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
/* .tensor_split = */ ts,
};
instances.push_back(instance);
}
return instances;
}
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
std::vector<cmd_params_instance> instances;
#if 1
// this ordering minimizes the number of times that each model needs to be reloaded
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & nb : params.n_batch)
@ -506,6 +522,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
@ -527,6 +544,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
@ -535,24 +553,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
instances.push_back(instance);
}
}
#else
// this ordering separates the prompt and generation tests
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
continue;
}
auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
}
for (const auto & n_gen : params.n_gen) {
if (n_gen == 0) {
continue;
}
auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
}
#endif
return instances;
}
@ -577,6 +577,7 @@ struct test {
ggml_type type_k;
ggml_type type_v;
int n_gpu_layers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
@ -598,6 +599,7 @@ struct test {
type_k = inst.type_k;
type_v = inst.type_v;
n_gpu_layers = inst.n_gpu_layers;
split_mode = inst.split_mode;
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
mul_mat_q = inst.mul_mat_q;
@ -664,7 +666,8 @@ struct test {
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "main_gpu", "no_kv_offload",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload",
"mul_mat_q", "tensor_split",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
@ -715,7 +718,8 @@ struct test {
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload),
std::to_string(mul_mat_q), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
@ -872,6 +876,9 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") {
return "ngl";
}
if (field == "split_mode") {
return "sm";
}
if (field == "n_threads") {
return "threads";
}
@ -912,6 +919,9 @@ struct markdown_printer : public printer {
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.push_back("main_gpu");
}
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
fields.push_back("split_mode");
}
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
fields.push_back("mul_mat_q");
}

33
examples/llama.android/.gitignore vendored Normal file
View file

@ -0,0 +1,33 @@
# Gradle files
.gradle/
build/
# Local configuration file (sdk path, etc)
local.properties
# Log/OS Files
*.log
# Android Studio generated files and folders
captures/
.externalNativeBuild/
.cxx/
*.apk
output.json
# IntelliJ
*.iml
.idea/
misc.xml
deploymentTargetDropDown.xml
render.experimental.xml
# Keystore files
*.jks
*.keystore
# Google Services (e.g. APIs or Firebase)
google-services.json
# Android Profiling
*.hprof

View file

1
examples/llama.android/app/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/build

View file

@ -0,0 +1,91 @@
plugins {
id("com.android.application")
id("org.jetbrains.kotlin.android")
}
android {
namespace = "com.example.llama"
compileSdk = 34
ndkVersion = "26.1.10909125"
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
targetSdk = 34
versionCode = 1
versionName = "1.0"
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
vectorDrawables {
useSupportLibrary = true
}
ndk {
// Workaround for https://github.com/llvm/llvm-project/issues/65820
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
if (project.hasProperty("skip-armeabi-v7a")) {
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
}
}
externalNativeBuild {
cmake {
cppFlags += listOf()
arguments += listOf()
}
}
}
buildTypes {
release {
isMinifyEnabled = false
proguardFiles(
getDefaultProguardFile("proguard-android-optimize.txt"),
"proguard-rules.pro"
)
}
}
compileOptions {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = "1.8"
}
buildFeatures {
compose = true
}
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
externalNativeBuild {
cmake {
path = file("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
}
dependencies {
implementation("androidx.core:core-ktx:1.12.0")
implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
implementation("androidx.activity:activity-compose:1.8.2")
implementation(platform("androidx.compose:compose-bom:2023.08.00"))
implementation("androidx.compose.ui:ui")
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
androidTestImplementation("androidx.compose.ui:ui-test-junit4")
debugImplementation("androidx.compose.ui:ui-tooling")
debugImplementation("androidx.compose.ui:ui-test-manifest")
}

View file

@ -0,0 +1,21 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile

View file

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:tools="http://schemas.android.com/tools">
<uses-permission android:name="android.permission.INTERNET" />
<application
android:allowBackup="true"
android:dataExtractionRules="@xml/data_extraction_rules"
android:fullBackupContent="@xml/backup_rules"
android:icon="@mipmap/ic_launcher"
android:label="@string/app_name"
android:roundIcon="@mipmap/ic_launcher_round"
android:supportsRtl="true"
android:theme="@style/Theme.LlamaAndroid"
>
<activity
android:name=".MainActivity"
android:exported="true"
android:theme="@style/Theme.LlamaAndroid">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>

View file

@ -0,0 +1,50 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
include(FetchContent)
FetchContent_Declare(
llama
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
GIT_TAG master
)
# Also provides "common"
FetchContent_MakeAvailable(llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)

View file

@ -0,0 +1,394 @@
#include <android/log.h>
#include <jni.h>
#include <iomanip>
#include <math.h>
#include <string>
#include <unistd.h>
#include "llama.h"
#include "common/common.h"
// Write C++ code here.
//
// Do not forget to dynamically load the C++ library into your application.
//
// For instance,
//
// In MainActivity.java:
// static {
// System.loadLibrary("llama-android");
// }
//
// Or, in MainActivity.kt:
// companion object {
// init {
// System.loadLibrary("llama-android")
// }
// }
#define TAG "llama-android.cpp"
#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
jclass la_int_var;
jmethodID la_int_var_value;
jmethodID la_int_var_inc;
static void log_callback(ggml_log_level level, const char * fmt, void * data) {
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
}
extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
llama_model_params model_params = llama_model_default_params();
auto path_to_model = env->GetStringUTFChars(filename, 0);
LOGi("Loading model from %s", path_to_model);
auto model = llama_load_model_from_file(path_to_model, model_params);
env->ReleaseStringUTFChars(filename, path_to_model);
if (!model) {
LOGe("load_model() failed");
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
return 0;
}
return reinterpret_cast<jlong>(model);
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
auto model = reinterpret_cast<llama_model *>(jmodel);
if (!model) {
LOGe("new_context(): model cannot be null");
env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null");
return 0;
}
int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2));
LOGi("Using %d threads", n_threads);
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = 2048;
ctx_params.n_threads = n_threads;
ctx_params.n_threads_batch = n_threads;
llama_context * context = llama_new_context_with_model(model, ctx_params);
if (!context) {
LOGe("llama_new_context_with_model() returned null)");
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
"llama_new_context_with_model() returned null)");
return 0;
}
return reinterpret_cast<jlong>(context);
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
llama_free(reinterpret_cast<llama_context *>(context));
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
llama_backend_free();
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
llama_log_set(log_callback, NULL);
}
extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_bench_1model(
JNIEnv *env,
jobject,
jlong context_pointer,
jlong model_pointer,
jlong batch_pointer,
jint pp,
jint tg,
jint pl,
jint nr
) {
auto pp_avg = 0.0;
auto tg_avg = 0.0;
auto pp_std = 0.0;
auto tg_std = 0.0;
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto model = reinterpret_cast<llama_model *>(model_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const int n_ctx = llama_n_ctx(context);
LOGi("n_ctx = %d", n_ctx);
int i, j;
int nri;
for (nri = 0; nri < nr; nri++) {
LOGi("Benchmark prompt processing (pp)");
llama_batch_clear(*batch);
const int n_tokens = pp;
for (i = 0; i < n_tokens; i++) {
llama_batch_add(*batch, 0, i, { 0 }, false);
}
batch->logits[batch->n_tokens - 1] = true;
llama_kv_cache_clear(context);
const auto t_pp_start = ggml_time_us();
if (llama_decode(context, *batch) != 0) {
LOGi("llama_decode() failed during prompt processing");
}
const auto t_pp_end = ggml_time_us();
// bench text generation
LOGi("Benchmark text generation (tg)");
llama_kv_cache_clear(context);
const auto t_tg_start = ggml_time_us();
for (i = 0; i < tg; i++) {
llama_batch_clear(*batch);
for (j = 0; j < pl; j++) {
llama_batch_add(*batch, 0, i, { j }, true);
}
LOGi("llama_decode() text generation: %d", i);
if (llama_decode(context, *batch) != 0) {
LOGi("llama_decode() failed during text generation");
}
}
const auto t_tg_end = ggml_time_us();
llama_kv_cache_clear(context);
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
const auto speed_pp = double(pp) / t_pp;
const auto speed_tg = double(pl * tg) / t_tg;
pp_avg += speed_pp;
tg_avg += speed_tg;
pp_std += speed_pp * speed_pp;
tg_std += speed_tg * speed_tg;
LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
}
pp_avg /= double(nr);
tg_avg /= double(nr);
if (nr > 1) {
pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1));
tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1));
} else {
pp_std = 0;
tg_std = 0;
}
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
const auto backend = "(Android)"; // TODO: What should this be?
std::stringstream result;
result << std::setprecision(2);
result << "| model | size | params | backend | test | t/s |\n";
result << "| --- | --- | --- | --- | --- | --- |\n";
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
return env->NewStringUTF(result.str().c_str());
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
llama_batch *batch = new llama_batch {
0,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
0,
0,
0,
};
if (embd) {
batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd);
} else {
batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
}
batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
for (int i = 0; i < n_tokens; ++i) {
batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
}
batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
return reinterpret_cast<jlong>(batch);
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
llama_backend_init(numa);
}
extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
return env->NewStringUTF(llama_print_system_info());
}
extern "C"
JNIEXPORT jint JNICALL
Java_com_example_llama_Llm_completion_1init(
JNIEnv *env,
jobject,
jlong context_pointer,
jlong batch_pointer,
jstring jtext,
jint n_len
) {
const auto text = env->GetStringUTFChars(jtext, 0);
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto tokens_list = llama_tokenize(context, text, 1);
auto n_ctx = llama_n_ctx(context);
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
if (n_kv_req > n_ctx) {
LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
}
for (auto id : tokens_list) {
LOGi("%s", llama_token_to_piece(context, id).c_str());
}
llama_batch_clear(*batch);
// evaluate the initial prompt
for (auto i = 0; i < tokens_list.size(); i++) {
llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
}
// llama_decode will output logits only for the last token of the prompt
batch->logits[batch->n_tokens - 1] = true;
if (llama_decode(context, *batch) != 0) {
LOGe("llama_decode() failed");
}
env->ReleaseStringUTFChars(jtext, text);
return batch->n_tokens;
}
extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_completion_1loop(
JNIEnv * env,
jobject,
jlong context_pointer,
jlong batch_pointer,
jint n_len,
jobject intvar_ncur
) {
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto model = llama_get_model(context);
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
auto n_vocab = llama_n_vocab(model);
auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// sample the most likely token
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
return env->NewStringUTF("");
}
auto new_token_chars = llama_token_to_piece(context, new_token_id);
LOGi("new_token_chars: `%s`", new_token_chars.c_str());
auto new_token = env->NewStringUTF(new_token_chars.c_str());
llama_batch_clear(*batch);
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
env->CallVoidMethod(intvar_ncur, la_int_var_inc);
if (llama_decode(context, *batch) != 0) {
LOGe("llama_decode() returned null");
}
return new_token;
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}

View file

@ -0,0 +1,119 @@
package com.example.llama
import android.app.DownloadManager
import android.net.Uri
import android.util.Log
import androidx.compose.material3.Button
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableDoubleStateOf
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.remember
import androidx.compose.runtime.rememberCoroutineScope
import androidx.compose.runtime.setValue
import androidx.core.database.getLongOrNull
import androidx.core.net.toUri
import kotlinx.coroutines.delay
import kotlinx.coroutines.launch
import java.io.File
data class Downloadable(val name: String, val source: Uri, val destination: File) {
companion object {
@JvmStatic
private val tag: String? = this::class.qualifiedName
sealed interface State
data object Ready: State
data class Downloading(val id: Long): State
data class Downloaded(val downloadable: Downloadable): State
data class Error(val message: String): State
@JvmStatic
@Composable
fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
var status: State by remember {
mutableStateOf(
if (item.destination.exists()) Downloaded(item)
else Ready
)
}
var progress by remember { mutableDoubleStateOf(0.0) }
val coroutineScope = rememberCoroutineScope()
suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
while (true) {
val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
if (cursor == null) {
Log.e(tag, "dm.query() returned null")
return Error("dm.query() returned null")
}
if (!cursor.moveToFirst() || cursor.count < 1) {
cursor.close()
Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
return Ready
}
val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
val sofar = cursor.getLongOrNull(pix) ?: 0
val total = cursor.getLongOrNull(tix) ?: 1
cursor.close()
if (sofar == total) {
return Downloaded(item)
}
progress = (sofar * 1.0) / total
delay(1000L)
}
}
fun onClick() {
when (val s = status) {
is Downloaded -> {
viewModel.load(item.destination.path)
}
is Downloading -> {
coroutineScope.launch {
status = waitForDownload(s, item)
}
}
else -> {
item.destination.delete()
val request = DownloadManager.Request(item.source).apply {
setTitle("Downloading model")
setDescription("Downloading model: ${item.name}")
setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
setDestinationUri(item.destination.toUri())
}
viewModel.log("Saving ${item.name} to ${item.destination.path}")
Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
val id = dm.enqueue(request)
status = Downloading(id)
onClick()
}
}
}
Button(onClick = { onClick() }, enabled = status !is Downloading) {
when (status) {
is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
is Downloaded -> Text("Load ${item.name}")
is Ready -> Text("Download ${item.name}")
is Error -> Text("Download ${item.name}")
}
}
}
}
}

View file

@ -0,0 +1,172 @@
package com.example.llama
import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher
import kotlinx.coroutines.asCoroutineDispatcher
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.flowOn
import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import kotlin.concurrent.thread
class Llm {
private val tag: String? = this::class.simpleName
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
thread(start = false, name = "Llm-RunLoop") {
Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")
// No-op if called more than once.
System.loadLibrary("llama-android")
// Set llama log handler to Android
log_to_android()
backend_init(false)
Log.d(tag, system_info())
it.run()
}.apply {
uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
Log.e(tag, "Unhandled exception", exception)
}
}
}.asCoroutineDispatcher()
private val nlen: Int = 64
private external fun log_to_android()
private external fun load_model(filename: String): Long
private external fun free_model(model: Long)
private external fun new_context(model: Long): Long
private external fun free_context(context: Long)
private external fun backend_init(numa: Boolean)
private external fun backend_free()
private external fun free_batch(batch: Long)
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
private external fun bench_model(
context: Long,
model: Long,
batch: Long,
pp: Int,
tg: Int,
pl: Int,
nr: Int
): String
private external fun system_info(): String
private external fun completion_init(
context: Long,
batch: Long,
text: String,
nLen: Int
): Int
private external fun completion_loop(
context: Long,
batch: Long,
nLen: Int,
ncur: IntVar
): String
private external fun kv_cache_clear(context: Long)
suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
return withContext(runLoop) {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
Log.d(tag, "bench(): $state")
bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
}
else -> throw IllegalStateException("No model loaded")
}
}
}
suspend fun load(pathToModel: String) {
withContext(runLoop) {
when (threadLocalState.get()) {
is State.Idle -> {
val model = load_model(pathToModel)
if (model == 0L) throw IllegalStateException("load_model() failed")
val context = new_context(model)
if (context == 0L) throw IllegalStateException("new_context() failed")
val batch = new_batch(512, 0, 1)
if (batch == 0L) throw IllegalStateException("new_batch() failed")
Log.i(tag, "Loaded model $pathToModel")
threadLocalState.set(State.Loaded(model, context, batch))
}
else -> throw IllegalStateException("Model already loaded")
}
}
}
fun send(message: String): Flow<String> = flow {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, nlen, ncur)
if (str.isEmpty()) {
break
}
emit(str)
}
kv_cache_clear(state.context)
}
else -> {}
}
}.flowOn(runLoop)
/**
* Unloads the model and frees resources.
*
* This is a no-op if there's no model loaded.
*/
suspend fun unload() {
withContext(runLoop) {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
free_context(state.context)
free_model(state.model)
free_batch(state.batch)
threadLocalState.set(State.Idle)
}
else -> {}
}
}
}
companion object {
private class IntVar(value: Int) {
@Volatile
var value: Int = value
private set
fun inc() {
synchronized(this) {
value += 1
}
}
}
private sealed interface State {
data object Idle: State
data class Loaded(val model: Long, val context: Long, val batch: Long): State
}
// Enforce only one instance of Llm.
private val _instance: Llm = Llm()
fun instance(): Llm = _instance
}
}

View file

@ -0,0 +1,154 @@
package com.example.llama
import android.app.ActivityManager
import android.app.DownloadManager
import android.content.ClipData
import android.content.ClipboardManager
import android.net.Uri
import android.os.Bundle
import android.os.StrictMode
import android.os.StrictMode.VmPolicy
import android.text.format.Formatter
import androidx.activity.ComponentActivity
import androidx.activity.compose.setContent
import androidx.activity.viewModels
import androidx.compose.foundation.layout.Box
import androidx.compose.foundation.layout.Column
import androidx.compose.foundation.layout.Row
import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.foundation.layout.padding
import androidx.compose.foundation.lazy.LazyColumn
import androidx.compose.foundation.lazy.items
import androidx.compose.foundation.lazy.rememberLazyListState
import androidx.compose.material3.Button
import androidx.compose.material3.LocalContentColor
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.OutlinedTextField
import androidx.compose.material3.Surface
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.ui.Modifier
import androidx.compose.ui.unit.dp
import androidx.core.content.getSystemService
import com.example.llama.ui.theme.LlamaAndroidTheme
import java.io.File
class MainActivity(
activityManager: ActivityManager? = null,
downloadManager: DownloadManager? = null,
clipboardManager: ClipboardManager? = null,
): ComponentActivity() {
private val tag: String? = this::class.simpleName
private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
private val viewModel: MainViewModel by viewModels()
// Get a MemoryInfo object for the device's current memory status.
private fun availableMemory(): ActivityManager.MemoryInfo {
return ActivityManager.MemoryInfo().also { memoryInfo ->
activityManager.getMemoryInfo(memoryInfo)
}
}
override fun onCreate(savedInstanceState: Bundle?) {
super.onCreate(savedInstanceState)
StrictMode.setVmPolicy(
VmPolicy.Builder(StrictMode.getVmPolicy())
.detectLeakedClosableObjects()
.build()
)
val free = Formatter.formatFileSize(this, availableMemory().availMem)
val total = Formatter.formatFileSize(this, availableMemory().totalMem)
viewModel.log("Current memory: $free / $total")
viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
val extFilesDir = getExternalFilesDir(null)
val models = listOf(
Downloadable(
"Phi-2 7B (Q4_0, 1.6 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
File(extFilesDir, "phi-2-q4_0.gguf"),
),
Downloadable(
"TinyLlama 1.1B (f16, 2.2 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
File(extFilesDir, "tinyllama-1.1-f16.gguf"),
),
Downloadable(
"Phi 2 DPO (Q3_K_M, 1.48 GiB)",
Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
),
)
setContent {
LlamaAndroidTheme {
// A surface container using the 'background' color from the theme
Surface(
modifier = Modifier.fillMaxSize(),
color = MaterialTheme.colorScheme.background
) {
MainCompose(
viewModel,
clipboardManager,
downloadManager,
models,
)
}
}
}
}
}
@Composable
fun MainCompose(
viewModel: MainViewModel,
clipboard: ClipboardManager,
dm: DownloadManager,
models: List<Downloadable>
) {
Column {
val scrollState = rememberLazyListState()
Box(modifier = Modifier.weight(1f)) {
LazyColumn(state = scrollState) {
items(viewModel.messages) {
Text(
it,
style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
modifier = Modifier.padding(16.dp)
)
}
}
}
OutlinedTextField(
value = viewModel.message,
onValueChange = { viewModel.updateMessage(it) },
label = { Text("Message") },
)
Row {
Button({ viewModel.send() }) { Text("Send") }
Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") }
Button({ viewModel.clear() }) { Text("Clear") }
Button({
viewModel.messages.joinToString("\n").let {
clipboard.setPrimaryClip(ClipData.newPlainText("", it))
}
}) { Text("Copy") }
}
Column {
for (model in models) {
Downloadable.Button(viewModel, dm, model)
}
}
}
}

View file

@ -0,0 +1,104 @@
package com.example.llama
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.setValue
import androidx.lifecycle.ViewModel
import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
companion object {
@JvmStatic
private val NanosPerSecond = 1_000_000_000.0
}
private val tag: String? = this::class.simpleName
var messages by mutableStateOf(listOf("Initializing..."))
private set
var message by mutableStateOf("")
private set
override fun onCleared() {
super.onCleared()
viewModelScope.launch {
try {
llm.unload()
} catch (exc: IllegalStateException) {
messages += exc.message!!
}
}
}
fun send() {
val text = message
message = ""
// Add to messages console.
messages += text
messages += ""
viewModelScope.launch {
llm.send(text)
.catch {
Log.e(tag, "send() failed", it)
messages += it.message!!
}
.collect { messages = messages.dropLast(1) + (messages.last() + it) }
}
}
fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
viewModelScope.launch {
try {
val start = System.nanoTime()
val warmupResult = llm.bench(pp, tg, pl, nr)
val end = System.nanoTime()
messages += warmupResult
val warmup = (end - start).toDouble() / NanosPerSecond
messages += "Warm up time: $warmup seconds, please wait..."
if (warmup > 5.0) {
messages += "Warm up took too long, aborting benchmark"
return@launch
}
messages += llm.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc)
messages += exc.message!!
}
}
}
fun load(pathToModel: String) {
viewModelScope.launch {
try {
llm.load(pathToModel)
messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc)
messages += exc.message!!
}
}
}
fun updateMessage(newMessage: String) {
message = newMessage
}
fun clear() {
messages = listOf()
}
fun log(message: String) {
messages += message
}
}

View file

@ -0,0 +1,11 @@
package com.example.llama.ui.theme
import androidx.compose.ui.graphics.Color
val Purple80 = Color(0xFFD0BCFF)
val PurpleGrey80 = Color(0xFFCCC2DC)
val Pink80 = Color(0xFFEFB8C8)
val Purple40 = Color(0xFF6650a4)
val PurpleGrey40 = Color(0xFF625b71)
val Pink40 = Color(0xFF7D5260)

View file

@ -0,0 +1,70 @@
package com.example.llama.ui.theme
import android.app.Activity
import android.os.Build
import androidx.compose.foundation.isSystemInDarkTheme
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.darkColorScheme
import androidx.compose.material3.dynamicDarkColorScheme
import androidx.compose.material3.dynamicLightColorScheme
import androidx.compose.material3.lightColorScheme
import androidx.compose.runtime.Composable
import androidx.compose.runtime.SideEffect
import androidx.compose.ui.graphics.toArgb
import androidx.compose.ui.platform.LocalContext
import androidx.compose.ui.platform.LocalView
import androidx.core.view.WindowCompat
private val DarkColorScheme = darkColorScheme(
primary = Purple80,
secondary = PurpleGrey80,
tertiary = Pink80
)
private val LightColorScheme = lightColorScheme(
primary = Purple40,
secondary = PurpleGrey40,
tertiary = Pink40
/* Other default colors to override
background = Color(0xFFFFFBFE),
surface = Color(0xFFFFFBFE),
onPrimary = Color.White,
onSecondary = Color.White,
onTertiary = Color.White,
onBackground = Color(0xFF1C1B1F),
onSurface = Color(0xFF1C1B1F),
*/
)
@Composable
fun LlamaAndroidTheme(
darkTheme: Boolean = isSystemInDarkTheme(),
// Dynamic color is available on Android 12+
dynamicColor: Boolean = true,
content: @Composable () -> Unit
) {
val colorScheme = when {
dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
val context = LocalContext.current
if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
}
darkTheme -> DarkColorScheme
else -> LightColorScheme
}
val view = LocalView.current
if (!view.isInEditMode) {
SideEffect {
val window = (view.context as Activity).window
window.statusBarColor = colorScheme.primary.toArgb()
WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
}
}
MaterialTheme(
colorScheme = colorScheme,
typography = Typography,
content = content
)
}

View file

@ -0,0 +1,34 @@
package com.example.llama.ui.theme
import androidx.compose.material3.Typography
import androidx.compose.ui.text.TextStyle
import androidx.compose.ui.text.font.FontFamily
import androidx.compose.ui.text.font.FontWeight
import androidx.compose.ui.unit.sp
// Set of Material typography styles to start with
val Typography = Typography(
bodyLarge = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Normal,
fontSize = 16.sp,
lineHeight = 24.sp,
letterSpacing = 0.5.sp
)
/* Other default text styles to override
titleLarge = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Normal,
fontSize = 22.sp,
lineHeight = 28.sp,
letterSpacing = 0.sp
),
labelSmall = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Medium,
fontSize = 11.sp,
lineHeight = 16.sp,
letterSpacing = 0.5.sp
)
*/
)

View file

@ -0,0 +1,170 @@
<?xml version="1.0" encoding="utf-8"?>
<vector xmlns:android="http://schemas.android.com/apk/res/android"
android:width="108dp"
android:height="108dp"
android:viewportWidth="108"
android:viewportHeight="108">
<path
android:fillColor="#3DDC84"
android:pathData="M0,0h108v108h-108z" />
<path
android:fillColor="#00000000"
android:pathData="M9,0L9,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,0L19,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M29,0L29,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M39,0L39,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M49,0L49,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M59,0L59,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M69,0L69,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M79,0L79,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M89,0L89,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M99,0L99,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,9L108,9"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,19L108,19"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,29L108,29"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,39L108,39"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,49L108,49"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,59L108,59"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,69L108,69"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,79L108,79"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,89L108,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,99L108,99"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,29L89,29"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,39L89,39"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,49L89,49"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,59L89,59"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,69L89,69"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,79L89,79"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M29,19L29,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M39,19L39,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M49,19L49,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M59,19L59,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M69,19L69,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M79,19L79,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
</vector>

View file

@ -0,0 +1,30 @@
<vector xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:aapt="http://schemas.android.com/aapt"
android:width="108dp"
android:height="108dp"
android:viewportWidth="108"
android:viewportHeight="108">
<path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
<aapt:attr name="android:fillColor">
<gradient
android:endX="85.84757"
android:endY="92.4963"
android:startX="42.9492"
android:startY="49.59793"
android:type="linear">
<item
android:color="#44000000"
android:offset="0.0" />
<item
android:color="#00000000"
android:offset="1.0" />
</gradient>
</aapt:attr>
</path>
<path
android:fillColor="#FFFFFF"
android:fillType="nonZero"
android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
android:strokeWidth="1"
android:strokeColor="#00000000" />
</vector>

View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
<background android:drawable="@drawable/ic_launcher_background" />
<foreground android:drawable="@drawable/ic_launcher_foreground" />
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
</adaptive-icon>

View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
<background android:drawable="@drawable/ic_launcher_background" />
<foreground android:drawable="@drawable/ic_launcher_foreground" />
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
</adaptive-icon>

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 982 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.6 KiB

View file

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<color name="purple_200">#FFBB86FC</color>
<color name="purple_500">#FF6200EE</color>
<color name="purple_700">#FF3700B3</color>
<color name="teal_200">#FF03DAC5</color>
<color name="teal_700">#FF018786</color>
<color name="black">#FF000000</color>
<color name="white">#FFFFFFFF</color>
</resources>

View file

@ -0,0 +1,3 @@
<resources>
<string name="app_name">LlamaAndroid</string>
</resources>

View file

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
</resources>

View file

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="utf-8"?><!--
Sample backup rules file; uncomment and customize as necessary.
See https://developer.android.com/guide/topics/data/autobackup
for details.
Note: This file is ignored for devices older that API 31
See https://developer.android.com/about/versions/12/backup-restore
-->
<full-backup-content>
<!--
<include domain="sharedpref" path="."/>
<exclude domain="sharedpref" path="device.xml"/>
-->
</full-backup-content>

View file

@ -0,0 +1,19 @@
<?xml version="1.0" encoding="utf-8"?><!--
Sample data extraction rules file; uncomment and customize as necessary.
See https://developer.android.com/about/versions/12/backup-restore#xml-changes
for details.
-->
<data-extraction-rules>
<cloud-backup>
<!-- TODO: Use <include> and <exclude> to control what is backed up.
<include .../>
<exclude .../>
-->
</cloud-backup>
<!--
<device-transfer>
<include .../>
<exclude .../>
</device-transfer>
-->
</data-extraction-rules>

View file

@ -0,0 +1,5 @@
// Top-level build file where you can add configuration options common to all sub-projects/modules.
plugins {
id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
}

View file

@ -0,0 +1,23 @@
# Project-wide Gradle settings.
# IDE (e.g. Android Studio) users:
# Gradle settings configured through the IDE *will override*
# any settings specified in this file.
# For more details on how to configure your build environment visit
# http://www.gradle.org/docs/current/userguide/build_environment.html
# Specifies the JVM arguments used for the daemon process.
# The setting is particularly useful for tweaking memory settings.
org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
# When configured, Gradle will run in incubating parallel mode.
# This option should only be used with decoupled projects. More details, visit
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
# org.gradle.parallel=true
# AndroidX package structure to make it clearer which packages are bundled with the
# Android operating system, and which are packaged with your app's APK
# https://developer.android.com/topic/libraries/support-library/androidx-rn
android.useAndroidX=true
# Kotlin code style for this project: "official" or "obsolete":
kotlin.code.style=official
# Enables namespacing of each library's R class so that its R class includes only the
# resources declared in the library itself and none from the library's dependencies,
# thereby reducing the size of the R class for that library
android.nonTransitiveRClass=true

Binary file not shown.

View file

@ -0,0 +1,6 @@
#Thu Dec 21 14:31:09 AEDT 2023
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

185
examples/llama.android/gradlew vendored Executable file
View file

@ -0,0 +1,185 @@
#!/usr/bin/env sh
#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
warn () {
echo "$*"
}
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
exec "$JAVACMD" "$@"

View file

@ -0,0 +1,17 @@
pluginManagement {
repositories {
google()
mavenCentral()
gradlePluginPortal()
}
}
dependencyResolutionManagement {
repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
repositories {
google()
mavenCentral()
}
}
rootProject.name = "LlamaAndroid"
include(":app")

View file

@ -1,7 +1,12 @@
# llama.swiftui
# llama.cpp/examples/llama.swiftui
Local inference of llama.cpp on an iPhone.
So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.
Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
point for more advanced projects.
For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
Video demonstration:
https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545

View file

@ -8,6 +8,7 @@
/* Begin PBXBuildFile section */
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
@ -22,6 +23,7 @@
/* Begin PBXFileReference section */
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = "<group>"; };
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
@ -119,6 +121,7 @@
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */,
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */,
);
path = UI;
sourceTree = "<group>";
@ -213,6 +216,7 @@
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -345,7 +349,7 @@
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = STLSG3FG8Q;
DEVELOPMENT_TEAM = K5UQJPP73A;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
@ -377,7 +381,7 @@
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = STLSG3FG8Q;
DEVELOPMENT_TEAM = K5UQJPP73A;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;

View file

@ -1,9 +1,19 @@
import Foundation
struct Model: Identifiable {
var id = UUID()
var name: String
var url: String
var filename: String
var status: String?
}
@MainActor
class LlamaState: ObservableObject {
@Published var messageLog = ""
@Published var cacheCleared = false
@Published var downloadedModels: [Model] = []
@Published var undownloadedModels: [Model] = []
let NS_PER_S = 1_000_000_000.0
private var llamaContext: LlamaContext?
@ -13,23 +23,102 @@ class LlamaState: ObservableObject {
}
init() {
loadModelsFromDisk()
loadDefaultModels()
}
private func loadModelsFromDisk() {
do {
let documentsURL = getDocumentsDirectory()
let modelURLs = try FileManager.default.contentsOfDirectory(at: documentsURL, includingPropertiesForKeys: nil, options: [.skipsHiddenFiles, .skipsSubdirectoryDescendants])
for modelURL in modelURLs {
let modelName = modelURL.deletingPathExtension().lastPathComponent
downloadedModels.append(Model(name: modelName, url: "", filename: modelURL.lastPathComponent, status: "downloaded"))
}
} catch {
print("Error loading models from disk: \(error)")
}
}
private func loadDefaultModels() {
do {
try loadModel(modelUrl: defaultModelUrl)
} catch {
messageLog += "Error!\n"
}
for model in defaultModels {
let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename)
if FileManager.default.fileExists(atPath: fileURL.path) {
} else {
var undownloadedModel = model
undownloadedModel.status = "download"
undownloadedModels.append(undownloadedModel)
}
}
}
func getDocumentsDirectory() -> URL {
let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
return paths[0]
}
private let defaultModels: [Model] = [
Model(name: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf", status: "download"),
Model(
name: "TinyLlama-1.1B Chat (Q8_0, 1.1 GiB)",
url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf?download=true",
filename: "tinyllama-1.1b-chat-v1.0.Q8_0.gguf", status: "download"
),
Model(
name: "TinyLlama-1.1B (F16, 2.2 GiB)",
url: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
filename: "tinyllama-1.1b-f16.gguf", status: "download"
),
Model(
name: "Phi-2.7B (Q4_0, 1.6 GiB)",
url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
filename: "phi-2-q4_0.gguf", status: "download"
),
Model(
name: "Phi-2.7B (Q8_0, 2.8 GiB)",
url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
filename: "phi-2-q8_0.gguf", status: "download"
),
Model(
name: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
url: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
filename: "mistral-7b-v0.1.Q4_0.gguf", status: "download"
),
Model(
name: "OpenHermes-2.5-Mistral-7B (Q3_K_M, 3.52 GiB)",
url: "https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q3_K_M.gguf?download=true",
filename: "openhermes-2.5-mistral-7b.Q3_K_M.gguf", status: "download"
)
]
func loadModel(modelUrl: URL?) throws {
if let modelUrl {
messageLog += "Loading model...\n"
llamaContext = try LlamaContext.create_context(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
// Assuming that the model is successfully loaded, update the downloaded models
updateDownloadedModels(modelName: modelUrl.lastPathComponent, status: "downloaded")
} else {
messageLog += "Load a model from the list below\n"
}
}
private func updateDownloadedModels(modelName: String, status: String) {
undownloadedModels.removeAll { $0.name == modelName }
}
func complete(text: String) async {
guard let llamaContext else {
return

View file

@ -2,115 +2,57 @@ import SwiftUI
struct ContentView: View {
@StateObject var llamaState = LlamaState()
@State private var multiLineText = ""
private static func cleanupModelCaches() {
// Delete all models (*.gguf)
let fileManager = FileManager.default
let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
do {
let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
for fileURL in fileURLs {
if fileURL.pathExtension == "gguf" {
try fileManager.removeItem(at: fileURL)
}
}
} catch {
print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
}
}
@State private var showingHelp = false // To track if Help Sheet should be shown
var body: some View {
VStack {
ScrollView(.vertical, showsIndicators: true) {
Text(llamaState.messageLog)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
NavigationView {
VStack {
ScrollView(.vertical, showsIndicators: true) {
Text(llamaState.messageLog)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
.padding()
.onTapGesture {
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
}
}
TextEditor(text: $multiLineText)
.frame(height: 80)
.padding()
.border(Color.gray, width: 0.5)
HStack {
Button("Send") {
sendText()
}
Button("Bench") {
bench()
}
Button("Clear") {
clear()
}
Button("Copy") {
UIPasteboard.general.string = llamaState.messageLog
}
}
.buttonStyle(.bordered)
.padding()
.onTapGesture {
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
}
}
TextEditor(text: $multiLineText)
.frame(height: 80)
NavigationLink(destination: DrawerView(llamaState: llamaState)) {
Text("View Models")
}
.padding()
.border(Color.gray, width: 0.5)
HStack {
Button("Send") {
sendText()
}
Button("Bench") {
bench()
}
Button("Clear") {
clear()
}
Button("Copy") {
UIPasteboard.general.string = llamaState.messageLog
}
}.buttonStyle(.bordered)
VStack(alignment: .leading) {
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
)
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)",
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
)
DownloadButton(
llamaState: llamaState,
modelName: "TinyLlama-1.1B (F16, 2.2 GiB)",
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
filename: "tinyllama-1.1b-f16.gguf"
)
DownloadButton(
llamaState: llamaState,
modelName: "Phi-2.7B (Q4_0, 1.6 GiB)",
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
filename: "phi-2-q4_0.gguf"
)
DownloadButton(
llamaState: llamaState,
modelName: "Phi-2.7B (Q8_0, 2.8 GiB)",
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
filename: "phi-2-q8_0.gguf"
)
DownloadButton(
llamaState: llamaState,
modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
filename: "mistral-7b-v0.1.Q4_0.gguf"
)
Button("Clear downloaded models") {
ContentView.cleanupModelCaches()
llamaState.cacheCleared = true
}
LoadCustomButton(llamaState: llamaState)
}
.padding(.top, 4)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
.padding()
.navigationBarTitle("Model Settings", displayMode: .inline)
}
.padding()
}
func sendText() {
@ -131,8 +73,73 @@ struct ContentView: View {
await llamaState.clear()
}
}
struct DrawerView: View {
@ObservedObject var llamaState: LlamaState
@State private var showingHelp = false
func delete(at offsets: IndexSet) {
offsets.forEach { offset in
let model = llamaState.downloadedModels[offset]
let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename)
do {
try FileManager.default.removeItem(at: fileURL)
} catch {
print("Error deleting file: \(error)")
}
}
// Remove models from downloadedModels array
llamaState.downloadedModels.remove(atOffsets: offsets)
}
func getDocumentsDirectory() -> URL {
let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
return paths[0]
}
var body: some View {
List {
Section(header: Text("Download Models From Hugging Face")) {
HStack {
InputButton(llamaState: llamaState)
}
}
Section(header: Text("Downloaded Models")) {
ForEach(llamaState.downloadedModels) { model in
DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename)
}
.onDelete(perform: delete)
}
Section(header: Text("Default Models")) {
ForEach(llamaState.undownloadedModels) { model in
DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename)
}
}
}
.listStyle(GroupedListStyle())
.navigationBarTitle("Model Settings", displayMode: .inline).toolbar {
ToolbarItem(placement: .navigationBarTrailing) {
Button("Help") {
showingHelp = true
}
}
}.sheet(isPresented: $showingHelp) { // Sheet for help modal
VStack(alignment: .leading) {
VStack(alignment: .leading) {
Text("1. Make sure the model is in GGUF Format")
.padding()
Text("2. Copy the download link of the quantized model")
.padding()
}
Spacer()
}
}
}
}
}
//#Preview {
// ContentView()
//}
struct ContentView_Previews: PreviewProvider {
static var previews: some View {
ContentView()
}
}

View file

@ -53,6 +53,8 @@ struct DownloadButton: View {
llamaState.cacheCleared = false
let model = Model(name: modelName, url: modelUrl, filename: filename, status: "downloaded")
llamaState.downloadedModels.append(model)
status = "downloaded"
}
} catch let err {

View file

@ -0,0 +1,131 @@
import SwiftUI
struct InputButton: View {
@ObservedObject var llamaState: LlamaState
@State private var inputLink: String = ""
@State private var status: String = "download"
@State private var filename: String = ""
@State private var downloadTask: URLSessionDownloadTask?
@State private var progress = 0.0
@State private var observation: NSKeyValueObservation?
private static func extractModelInfo(from link: String) -> (modelName: String, filename: String)? {
guard let url = URL(string: link),
let lastPathComponent = url.lastPathComponent.components(separatedBy: ".").first,
let modelName = lastPathComponent.components(separatedBy: "-").dropLast().joined(separator: "-").removingPercentEncoding,
let filename = lastPathComponent.removingPercentEncoding else {
return nil
}
return (modelName, filename)
}
private static func getFileURL(filename: String) -> URL {
FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
}
private func download() {
guard let extractedInfo = InputButton.extractModelInfo(from: inputLink) else {
// Handle invalid link or extraction failure
return
}
let (modelName, filename) = extractedInfo
self.filename = filename // Set the state variable
status = "downloading"
print("Downloading model \(modelName) from \(inputLink)")
guard let url = URL(string: inputLink) else { return }
let fileURL = InputButton.getFileURL(filename: filename)
downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
if let error = error {
print("Error: \(error.localizedDescription)")
return
}
guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
print("Server error!")
return
}
do {
if let temporaryURL = temporaryURL {
try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
print("Writing to \(filename) completed")
llamaState.cacheCleared = false
let model = Model(name: modelName, url: self.inputLink, filename: filename, status: "downloaded")
llamaState.downloadedModels.append(model)
status = "downloaded"
}
} catch let err {
print("Error: \(err.localizedDescription)")
}
}
observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
self.progress = progress.fractionCompleted
}
downloadTask?.resume()
}
var body: some View {
VStack {
HStack {
TextField("Paste Quantized Download Link", text: $inputLink)
.textFieldStyle(RoundedBorderTextFieldStyle())
Button(action: {
downloadTask?.cancel()
status = "download"
}) {
Text("Cancel")
}
}
if status == "download" {
Button(action: download) {
Text("Download Custom Model")
}
} else if status == "downloading" {
Button(action: {
downloadTask?.cancel()
status = "download"
}) {
Text("Downloading \(Int(progress * 100))%")
}
} else if status == "downloaded" {
Button(action: {
let fileURL = InputButton.getFileURL(filename: self.filename)
if !FileManager.default.fileExists(atPath: fileURL.path) {
download()
return
}
do {
try llamaState.loadModel(modelUrl: fileURL)
} catch let err {
print("Error: \(err.localizedDescription)")
}
}) {
Text("Load Custom Model")
}
} else {
Text("Unknown status")
}
}
.onDisappear() {
downloadTask?.cancel()
}
.onChange(of: llamaState.cacheCleared) { newValue in
if newValue {
downloadTask?.cancel()
let fileURL = InputButton.getFileURL(filename: self.filename)
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
}
}
}
}

View file

@ -126,24 +126,7 @@ static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::str
}
static std::string get_ftype(int ftype) {
switch (ftype) {
case 0:
return "f32";
case 1:
return "f16";
case 2:
return "q4_0";
case 3:
return "q4_1";
case 6:
return "q5_0";
case 7:
return "q5_1";
case 8:
return "q8_0";
default:
throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
}
return ggml_type_name(static_cast<ggml_type>(ftype));
}
//
@ -533,6 +516,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
buffer_size += n_tensors * 128 /* CLIP PADDING */;
clip_ctx * new_clip = new clip_ctx;
#ifdef GGML_USE_CUBLAS
new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__);
@ -543,6 +527,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("%s: CLIP using Metal backend\n", __func__);
#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
printf("%s: CLIP using CPU backend\n", __func__);
@ -931,26 +916,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
ggml_type type = GGML_TYPE_Q4_1;
switch (itype) {
case 2:
type = GGML_TYPE_Q4_0;
break;
case 3:
type = GGML_TYPE_Q4_1;
break;
case 6:
type = GGML_TYPE_Q5_0;
break;
case 7:
type = GGML_TYPE_Q5_1;
break;
case 8:
type = GGML_TYPE_Q8_0;
break;
default:
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
return false;
};
assert(itype < GGML_TYPE_COUNT);
type = static_cast<ggml_type>(itype);
auto * ctx_clip = clip_model_load(fname_inp, 2);
@ -1010,6 +977,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
if (quantize) {
new_type = type;
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
// fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
}
const size_t n_elms = ggml_nelements(cur);
float * f32_data;
@ -1054,6 +1025,21 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
case GGML_TYPE_Q8_0: {
new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q2_K: {
new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q3_K: {
new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_K: {
new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_K: {
new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q6_K: {
new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
} break;
default: {
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
return false;

View file

@ -243,6 +243,9 @@ int main(int argc, char ** argv) {
}
auto image_embed = load_image(ctx_llava, &params);
if (!image_embed) {
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);

View file

@ -477,6 +477,7 @@ int main(int argc, char ** argv) {
bool is_antiprompt = false;
bool input_echo = true;
bool display = true;
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
int n_past = 0;
@ -491,6 +492,7 @@ int main(int argc, char ** argv) {
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
display = params.display_prompt;
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
@ -500,7 +502,7 @@ int main(int argc, char ** argv) {
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
@ -650,6 +652,10 @@ int main(int argc, char ** argv) {
n_past += n_eval;
LOG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) {
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
}
}
if (!embd.empty() && !path_session.empty()) {
@ -703,7 +709,7 @@ int main(int argc, char ** argv) {
}
// display text
if (input_echo) {
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
@ -720,6 +726,7 @@ int main(int argc, char ** argv) {
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
display = true;
}
// if not currently processing queued inputs;
@ -792,6 +799,7 @@ int main(int argc, char ** argv) {
// color user input only
console::set_display(console::user_input);
display = params.display_prompt;
std::string line;
bool another_line = true;
@ -802,6 +810,7 @@ int main(int argc, char ** argv) {
// done taking input, reset color
console::set_display(console::reset);
display = true;
// Add tokens to embd only if the input buffer is non-empty
// Entering a empty line lets the user pass control back

View file

@ -1,4 +0,0 @@
set(TEST_TARGET metal)
add_executable(${TEST_TARGET} metal.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)

View file

@ -1,103 +0,0 @@
// Evaluate a statically exported ggml computation graph with Metal
//
// - First, export a LLaMA graph:
//
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
//
// - Run this tool to evaluate the exported graph:
//
// $ ./bin/metal llama.ggml
//
// The purpose of this tool is mostly for debugging and demonstration purposes.
// The main limitation of exporting computation graphs is that their sizes are static which often
// can be a problem for real-world applications.
//
#include "ggml.h"
#include "ggml-metal.h"
#include <cstdio>
#include <cstring>
#include <cstdlib>
int main(int argc, char ** argv) {
ggml_time_init();
if (argc != 2) {
fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]);
return -1;
}
const char * fname_cgraph = argv[1];
// load the compute graph
struct ggml_context * ctx_data = NULL;
struct ggml_context * ctx_eval = NULL;
struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
// this allocates all Metal resources and memory buffers
auto * ctx_metal = ggml_metal_init(1);
const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
// main
{
struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
*(int32_t *) input->data = 1; // BOS
ggml_metal_set_tensor(ctx_metal, input);
// warmup
ggml_metal_graph_compute(ctx_metal, gf);
const int n_iter = 16;
const int64_t t0 = ggml_time_us();
// the actual inference happens here
for (int i = 0; i < n_iter; ++i) {
ggml_metal_graph_compute(ctx_metal, gf);
}
const int64_t t1 = ggml_time_us();
printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
}
// debug output
{
struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
ggml_metal_get_tensor(ctx_metal, logits);
float * ptr = (float *) ggml_get_data(logits);
printf("logits: ");
for (int i = 0; i < 10; i++) {
printf("%8.4f ", ptr[i]);
}
printf("\n");
int imax = 0;
double sum = 0.0;
double vmax = -1e9;
for (int i = 0; i < 32000; i++) {
sum += (double) ptr[i];
if (ptr[i] > vmax) {
vmax = ptr[i];
imax = i;
}
}
printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
}
ggml_metal_free(ctx_metal);
ggml_free(ctx_data);
ggml_free(ctx_eval);
return 0;
}

View file

@ -428,6 +428,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
size_t n_tokens = tokens.size() - i_chunk * n_batch;
n_tokens = std::min(n_tokens, size_t(n_batch));
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return {};

View file

@ -0,0 +1,255 @@
# Function calling example using pydantic models.
import datetime
import json
from enum import Enum
from typing import Union, Optional
import requests
from pydantic import BaseModel, Field
import importlib
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
# Function to get completion on the llama.cpp server with grammar.
def create_completion(prompt, grammar):
headers = {"Content-Type": "application/json"}
data = {"prompt": prompt, "grammar": grammar}
response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
data = response.json()
print(data["content"])
return data["content"]
# A function for the agent to send a message to the user.
class SendMessageToUser(BaseModel):
"""
Send a message to the User.
"""
chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
message: str = Field(..., description="Message you want to send to the user.")
def run(self):
print(self.message)
# Enum for the calculator function.
class MathOperation(Enum):
ADD = "add"
SUBTRACT = "subtract"
MULTIPLY = "multiply"
DIVIDE = "divide"
# Very simple calculator tool for the agent.
class Calculator(BaseModel):
"""
Perform a math operation on two numbers.
"""
number_one: Union[int, float] = Field(..., description="First number.")
operation: MathOperation = Field(..., description="Math operation to perform.")
number_two: Union[int, float] = Field(..., description="Second number.")
def run(self):
if self.operation == MathOperation.ADD:
return self.number_one + self.number_two
elif self.operation == MathOperation.SUBTRACT:
return self.number_one - self.number_two
elif self.operation == MathOperation.MULTIPLY:
return self.number_one * self.number_two
elif self.operation == MathOperation.DIVIDE:
return self.number_one / self.number_two
else:
raise ValueError("Unknown operation.")
# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
# pydantic_model_list is the list of pydanitc models
# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
# outer_object_content is the name of outer object content.
# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
print(gbnf_grammar)
print(documentation)
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
# This should output something like this:
# {
# "function": "calculator",
# "function_parameters": {
# "number_one": 42,
# "operation": "multiply",
# "number_two": 42
# }
# }
function_dictionary = json.loads(text)
if function_dictionary["function"] == "calculator":
function_parameters = {**function_dictionary["function_parameters"]}
print(Calculator(**function_parameters).run())
# This should output: 1764
# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
class Category(Enum):
"""
The category of the book.
"""
Fiction = "Fiction"
NonFiction = "Non-Fiction"
class Book(BaseModel):
"""
Represents an entry about a book.
"""
title: str = Field(..., description="Title of the book.")
author: str = Field(..., description="Author of the book.")
published_year: Optional[int] = Field(..., description="Publishing year of the book.")
keywords: list[str] = Field(..., description="A list of keywords.")
category: Category = Field(..., description="Category of the book.")
summary: str = Field(..., description="Summary of the book.")
# We need no additional parameters other than our list of pydantic models.
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
json_data = json.loads(text)
print(Book(**json_data))
# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
def get_current_datetime(output_format: Optional[str] = None):
"""
Get the current date and time in the given format.
Args:
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
"""
if output_format is None:
output_format = '%Y-%m-%d %H:%M:%S'
return datetime.datetime.now().strftime(output_format)
# Enum for the calculator tool.
class MathOperation(Enum):
ADD = "add"
SUBTRACT = "subtract"
MULTIPLY = "multiply"
DIVIDE = "divide"
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
class Calculator(BaseModel):
"""
Perform a math operation on two numbers.
"""
number_one: Union[int, float] = Field(..., description="First number.")
operation: MathOperation = Field(..., description="Math operation to perform.")
number_two: Union[int, float] = Field(..., description="Second number.")
def run(self):
if self.operation == MathOperation.ADD:
return self.number_one + self.number_two
elif self.operation == MathOperation.SUBTRACT:
return self.number_one - self.number_two
elif self.operation == MathOperation.MULTIPLY:
return self.number_one * self.number_two
elif self.operation == MathOperation.DIVIDE:
return self.number_one / self.number_two
else:
raise ValueError("Unknown operation.")
# Example function to get the weather
def get_current_weather(location, unit):
"""Get the current weather in a given location"""
if "London" in location:
return json.dumps({"location": "London", "temperature": "42", "unit": unit.value})
elif "New York" in location:
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
elif "North Pole" in location:
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
else:
return json.dumps({"location": location, "temperature": "unknown"})
# Here is a function definition in OpenAI style
current_weather_tool = {
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
# Convert OpenAI function definition into pydantic model
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
# Add the actual function to a pydantic model
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
# Convert normal Python function to a pydantic model
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tool_list, outer_object_name="function",
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
json_data = json.loads(text)
print(json_data)
# Should output something like this:
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
for call in json_data:
if call["function"] == "Calculator":
print(Calculator(**call["params"]).run())
elif call["function"] == "get_current_datetime":
print(current_datetime_model(**call["params"]).run())
elif call["function"] == "get_current_weather":
print(current_weather_tool_model(**call["params"]).run())
# Should output something like this:
# 2024-01-14 13:36:06
# {"location": "London", "temperature": "42", "unit": "celsius"}
# 1764

File diff suppressed because it is too large Load diff

View file

@ -5,6 +5,10 @@
#include <cstring>
#include <vector>
#include <string>
#include <unordered_map>
#include <fstream>
#include <cmath>
#include <algorithm>
struct quant_option {
std::string name;
@ -17,7 +21,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
@ -71,10 +78,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
if (it.name != "COPY") {
@ -82,11 +93,93 @@ static void usage(const char * executable) {
} else {
printf(" ");
}
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
}
exit(1);
}
static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
return;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
return;
}
for (int i = 0; i < n_entries; ++i) {
int len; in.read((char *)&len, sizeof(len));
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
return;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto& e = imatrix_data[std::move(name)];
int ncall;
in.read((char*)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i);
imatrix_data = {};
return;
}
e.resize(nval);
in.read((char*)e.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {};
return;
}
if (ncall > 0) {
for (auto& v : e) v /= ncall;
}
}
printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
}
static void prepare_imatrix(const std::string& imatrix_file,
const std::vector<std::string>& included_weights,
const std::vector<std::string>& excluded_weights,
std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
if (!imatrix_file.empty()) {
load_imatrix(imatrix_file, imatrix_data);
}
if (imatrix_data.empty()) {
return;
}
if (!excluded_weights.empty()) {
for (auto& name : excluded_weights) {
for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
auto pos = it->first.find(name);
if (pos != std::string::npos) it = imatrix_data.erase(it);
else ++it;
}
}
}
if (!included_weights.empty()) {
std::unordered_map<std::string, std::vector<float>> tmp;
for (auto& name : included_weights) {
for (auto& e : imatrix_data) {
auto pos = e.first.find(name);
if (pos != std::string::npos) {
tmp.emplace(std::move(e));
}
}
}
imatrix_data = std::move(tmp);
}
if (!imatrix_data.empty()) {
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
}
}
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@ -95,6 +188,8 @@ int main(int argc, char ** argv) {
llama_model_quantize_params params = llama_model_quantize_default_params();
int arg_idx = 1;
std::string imatrix_file;
std::vector<std::string> included_weights, excluded_weights;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@ -103,14 +198,42 @@ int main(int argc, char ** argv) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
params.pure = true;
} else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
if (arg_idx < argc-1) {
imatrix_file = argv[++arg_idx];
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
if (arg_idx < argc-1) {
included_weights.push_back(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
if (arg_idx < argc-1) {
excluded_weights.push_back(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else {
usage(argv[0]);
}
}
if (argc - arg_idx < 2) {
printf("%s: bad arguments\n", argv[0]);
usage(argv[0]);
}
if (!included_weights.empty() && !excluded_weights.empty()) {
usage(argv[0]);
}
std::unordered_map<std::string, std::vector<float>> imatrix_data;
prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
if (!imatrix_data.empty()) {
params.imatrix = &imatrix_data;
}
llama_backend_init(false);
@ -162,6 +285,13 @@ int main(int argc, char ** argv) {
}
}
if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
fprintf(stderr, "\n===============================================================================================\n");
fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
fprintf(stderr, "===============================================================================================\n\n\n");
return 1;
}
print_build_info();
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());

View file

@ -45,13 +45,13 @@ int main(int argc, char ** argv) {
// save state (rng, logits, embedding and kv_cache) to file
{
std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
const size_t written = llama_copy_state_data(ctx, state_mem.data());
{
FILE *fp_write = fopen("dump_state.bin", "wb");
llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
fclose(fp_write);
}
FILE *fp_write = fopen("dump_state.bin", "wb");
fwrite(state_mem.data(), 1, written, fp_write);
fclose(fp_write);
fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size());
}
// save state (last tokens)
@ -100,18 +100,17 @@ int main(int argc, char ** argv) {
std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
FILE * fp_read = fopen("dump_state.bin", "rb");
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
if (ret != state_mem.size()) {
if (read != llama_set_state_data(ctx2, state_mem.data())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}
llama_set_state_data(ctx2, state_mem.data());
fclose(fp_read);
fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
}
// restore state (last tokens)

View file

@ -23,6 +23,8 @@ Command line options:
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
- `--port`: Set the port to listen. Default: `8080`.
- `--path`: path from which to serve static files (default examples/server/public)
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
- `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
@ -109,6 +111,10 @@ node index.js
```
## API Endpoints
- **GET** `/health`: Returns the current state of the server:
- `{"status": "loading model"}` if the model is still being loaded.
- `{"status": "error"}` if the model failed to load.
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
@ -174,35 +180,44 @@ node index.js
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
*Result JSON:*
### Result JSON:
Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
`content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
`stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
`generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
```
{
"content": "<the token selected by the model>",
"probs": [
{
"prob": float,
"tok_str": "<most likely token>"
},
{
"prob": float,
"tok_str": "<second most likely tonen>"
},
...
]
},
```
Notice that each `probs` is an array of length `n_probs`.
`model`: The path to the model loaded with `-m`
`prompt`: The provided `prompt`
`stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
`stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
`stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
`stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
`timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
`tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
`tokens_evaluated`: Number of tokens evaluated in total from the prompt
`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
- `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
- **POST** `/tokenize`: Tokenize a given text.

View file

@ -26,6 +26,7 @@
#include <mutex>
#include <chrono>
#include <condition_variable>
#include <atomic>
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
@ -38,7 +39,7 @@ using json = nlohmann::json;
struct server_params
{
std::string hostname = "127.0.0.1";
std::string api_key;
std::vector<std::string> api_keys;
std::string public_path = "examples/server/public";
int32_t port = 8080;
int32_t read_timeout = 600;
@ -146,9 +147,15 @@ static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
// parallel
//
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum task_type {
COMPLETION_TASK,
CANCEL_TASK
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
};
struct task_server {
@ -1173,8 +1180,9 @@ struct llama_server_context
return slot.images.size() > 0;
}
void send_error(task_server& task, std::string error)
void send_error(task_server& task, const std::string &error)
{
LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
std::unique_lock<std::mutex> lock(mutex_results);
task_result res;
res.id = task.id;
@ -1343,14 +1351,17 @@ struct llama_server_context
res.result_json["model"] = slot.oaicompat_model;
}
queue_results.push_back(res);
condition_results.notify_all();
// done with results, unlock
lock.unlock();
// parent multitask, if any, needs to be updated
if (slot.multitask_id != -1)
{
update_multi_task(slot.multitask_id, slot.task_id, res);
}
queue_results.push_back(res);
condition_results.notify_all();
}
void send_embedding(llama_client_slot &slot)
@ -1395,11 +1406,11 @@ struct llama_server_context
task.data = std::move(data);
task.infill_mode = infill;
task.embedding_mode = embedding;
task.type = COMPLETION_TASK;
task.type = TASK_TYPE_COMPLETION;
task.multitask_id = multitask_id;
// when a completion task's prompt array is not a singleton, we split it into multiple requests
if (task.data.at("prompt").size() > 1)
if (task.data.count("prompt") && task.data.at("prompt").size() > 1)
{
lock.unlock(); // entering new func scope
return split_multiprompt_task(task);
@ -1517,7 +1528,7 @@ struct llama_server_context
std::unique_lock<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.type = CANCEL_TASK;
task.type = TASK_TYPE_CANCEL;
task.target_id = task_id;
queue_tasks.push_back(task);
condition_tasks.notify_one();
@ -1553,26 +1564,36 @@ struct llama_server_context
queue_tasks.erase(queue_tasks.begin());
switch (task.type)
{
case COMPLETION_TASK: {
case TASK_TYPE_COMPLETION: {
llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
if (slot == nullptr)
{
LOG_TEE("slot unavailable\n");
// send error result
send_error(task, "slot unavailable");
return;
break;
}
if (task.data.contains("system_prompt"))
{
if (!all_slots_are_idle) {
send_error(task, "system prompt can only be updated when all slots are idle");
break;
}
process_system_prompt_data(task.data["system_prompt"]);
// reset cache_tokens for all slots
for (llama_client_slot &slot : slots)
{
slot.cache_tokens.clear();
}
}
slot->reset();
slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->task_id = task.id;
slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->task_id = task.id;
slot->multitask_id = task.multitask_id;
if (!launch_slot_with_data(slot, task.data))
@ -1582,7 +1603,7 @@ struct llama_server_context
break;
}
} break;
case CANCEL_TASK: { // release slot linked with the task id
case TASK_TYPE_CANCEL: { // release slot linked with the task id
for (auto & slot : slots)
{
if (slot.task_id == task.target_id)
@ -1596,6 +1617,7 @@ struct llama_server_context
}
// remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
std::vector<task_result> agg_results;
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
@ -1616,8 +1638,9 @@ struct llama_server_context
}
aggregate_result.result_json = json{ "results", result_jsons };
std::lock_guard<std::mutex> lock(mutex_results);
queue_results.push_back(aggregate_result);
agg_results.push_back(aggregate_result);
condition_results.notify_all();
queue_iterator = queue_multitasks.erase(queue_iterator);
@ -1627,14 +1650,20 @@ struct llama_server_context
++queue_iterator;
}
}
// done with tasks, unlock
lock.unlock();
// copy aggregate results of complete multi-tasks to the results queue
std::lock_guard<std::mutex> lock_results(mutex_results);
queue_results.insert(queue_results.end(), agg_results.begin(), agg_results.end());
}
bool update_slots() {
// attend tasks
process_tasks();
// update the system prompt wait until all slots are idle state
if (system_need_update && all_slots_are_idle)
if (system_need_update)
{
LOG_TEE("updating system prompt\n");
update_system_prompt();
@ -1724,7 +1753,8 @@ struct llama_server_context
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
// empty prompt passed -> release the slot and send empty response
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
// note: infill mode allows empty prompt
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill)
{
slot.release();
slot.print_timings();
@ -1827,7 +1857,7 @@ struct llama_server_context
slot.cache_tokens = prompt_tokens;
if (slot.n_past == slot.num_prompt_tokens)
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
{
// we have to evaluate at least 1 token to generate logits.
LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
@ -1997,12 +2027,15 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
printf(" how to split the model across multiple GPUs, one of:\n");
printf(" - none: use one GPU only\n");
printf(" - layer (default): split layers and KV across GPUs\n");
printf(" - row: split rows across GPUs\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
printf(" -nommq, --no-mul-mat-q\n");
printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row)\n");
#endif
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
@ -2014,6 +2047,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
@ -2074,7 +2108,28 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
sparams.api_key = argv[i];
sparams.api_keys.push_back(argv[i]);
}
else if (arg == "--api-key-file")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
std::ifstream key_file(argv[i]);
if (!key_file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::string key;
while (std::getline(key_file, key)) {
if (key.size() > 0) {
sparams.api_keys.push_back(key);
}
}
key_file.close();
}
else if (arg == "--timeout" || arg == "-to")
{
@ -2223,6 +2278,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
"See main README.md for information on enabling GPU BLAS support",
{{"n_gpu_layers", params.n_gpu_layers}});
#endif
}
else if (arg == "--split-mode" || arg == "-sm")
{
if (++i >= argc) {
invalid_param = true;
break;
}
std::string arg_next = argv[i];
if (arg_next == "none")
{
params.split_mode = LLAMA_SPLIT_NONE;
}
else if (arg_next == "layer")
{
params.split_mode = LLAMA_SPLIT_LAYER;
}
else if (arg_next == "row")
{
params.split_mode = LLAMA_SPLIT_ROW;
}
else {
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS
}
else if (arg == "--tensor-split" || arg == "-ts")
{
@ -2453,7 +2535,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
}
static std::string random_string()
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
@ -2509,7 +2590,7 @@ json oaicompat_completion_params_parse(
//
// https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("uknown"));
llama_params["model"] = json_value(body, "model", std::string("unknown"));
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
@ -2581,8 +2662,8 @@ static json format_final_response_oaicompat(const json &request, const task_resu
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage",
json{{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
{"id", gen_chatcmplid()}};
if (server_verbose) {
@ -2790,20 +2871,131 @@ int main(int argc, char **argv)
{"system_info", llama_print_system_info()},
});
// load the model
if (!llama.load_model(params))
httplib::Server svr;
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
svr.set_default_headers({{"Server", "llama.cpp"}});
// CORS preflight
svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res.set_header("Access-Control-Allow-Credentials", "true");
res.set_header("Access-Control-Allow-Methods", "POST");
res.set_header("Access-Control-Allow-Headers", "*");
});
svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
server_state current_state = state.load();
switch(current_state) {
case SERVER_STATE_READY:
res.set_content(R"({"status": "ok"})", "application/json");
res.status = 200; // HTTP OK
break;
case SERVER_STATE_LOADING_MODEL:
res.set_content(R"({"status": "loading model"})", "application/json");
res.status = 503; // HTTP Service Unavailable
break;
case SERVER_STATE_ERROR:
res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json");
res.status = 500; // HTTP Internal Server Error
break;
}
});
svr.set_logger(log_server_request);
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
{
const char fmt[] = "500 Internal Server Error\n%s";
char buf[BUFSIZ];
try
{
std::rethrow_exception(std::move(ep));
}
catch (std::exception &e)
{
snprintf(buf, sizeof(buf), fmt, e.what());
}
catch (...)
{
snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
}
res.set_content(buf, "text/plain; charset=utf-8");
res.status = 500;
});
svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
{
if (res.status == 401)
{
res.set_content("Unauthorized", "text/plain; charset=utf-8");
}
if (res.status == 400)
{
res.set_content("Invalid request", "text/plain; charset=utf-8");
}
else if (res.status == 404)
{
res.set_content("File Not Found", "text/plain; charset=utf-8");
res.status = 404;
}
});
// set timeouts and change hostname and port
svr.set_read_timeout (sparams.read_timeout);
svr.set_write_timeout(sparams.write_timeout);
if (!svr.bind_to_port(sparams.hostname, sparams.port))
{
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
return 1;
}
llama.initialize();
// Set the base directory for serving static files
svr.set_base_dir(sparams.public_path);
httplib::Server svr;
// to make it ctrl+clickable:
LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
std::unordered_map<std::string, std::string> log_data;
log_data["hostname"] = sparams.hostname;
log_data["port"] = std::to_string(sparams.port);
if (sparams.api_keys.size() == 1) {
log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4);
} else if (sparams.api_keys.size() > 1) {
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
}
LOG_INFO("HTTP server listening", log_data);
// run the HTTP server in a thread - see comment below
std::thread t([&]()
{
if (!svr.listen_after_bind())
{
state.store(SERVER_STATE_ERROR);
return 1;
}
return 0;
});
// load the model
if (!llama.load_model(params))
{
state.store(SERVER_STATE_ERROR);
return 1;
} else {
llama.initialize();
state.store(SERVER_STATE_READY);
LOG_INFO("model loaded", {});
}
// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation
if (sparams.api_key.empty()) {
if (sparams.api_keys.empty()) {
return true;
}
@ -2812,7 +3004,7 @@ int main(int argc, char **argv)
std::string prefix = "Bearer ";
if (auth_header.substr(0, prefix.size()) == prefix) {
std::string received_api_key = auth_header.substr(prefix.size());
if (received_api_key == sparams.api_key) {
if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
return true; // API key is valid
}
}
@ -2826,10 +3018,6 @@ int main(int argc, char **argv)
return false;
};
svr.set_default_headers({{"Server", "llama.cpp"},
{"Access-Control-Allow-Origin", "*"},
{"Access-Control-Allow-Headers", "content-type"}});
// this is only called if no index.html is found in the public --path
svr.Get("/", [](const httplib::Request &, httplib::Response &res)
{
@ -2858,9 +3046,9 @@ int main(int argc, char **argv)
return false;
});
svr.Get("/props", [&llama](const httplib::Request & /*req*/, httplib::Response &res)
svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", "*");
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = {
{ "user_name", llama.name_user.c_str() },
{ "assistant_name", llama.name_assistant.c_str() }
@ -2870,6 +3058,7 @@ int main(int argc, char **argv)
svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!validate_api_key(req, res)) {
return;
}
@ -2937,10 +3126,9 @@ int main(int argc, char **argv)
}
});
svr.Get("/v1/models", [&params](const httplib::Request&, httplib::Response& res)
svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
std::time_t t = std::time(0);
json models = {
@ -2958,9 +3146,11 @@ int main(int argc, char **argv)
res.set_content(models.dump(), "application/json; charset=utf-8");
});
// TODO: add mount point without "/v1" prefix -- how?
svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!validate_api_key(req, res)) {
return;
}
@ -3034,6 +3224,7 @@ int main(int argc, char **argv)
svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!validate_api_key(req, res)) {
return;
}
@ -3106,6 +3297,7 @@ int main(int argc, char **argv)
svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
std::vector<llama_token> tokens;
if (body.count("content") != 0)
@ -3118,6 +3310,7 @@ int main(int argc, char **argv)
svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
std::string content;
if (body.count("tokens") != 0)
@ -3132,6 +3325,7 @@ int main(int argc, char **argv)
svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
json prompt;
if (body.count("content") != 0)
@ -3157,81 +3351,6 @@ int main(int argc, char **argv)
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
});
svr.set_logger(log_server_request);
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
{
const char fmt[] = "500 Internal Server Error\n%s";
char buf[BUFSIZ];
try
{
std::rethrow_exception(std::move(ep));
}
catch (std::exception &e)
{
snprintf(buf, sizeof(buf), fmt, e.what());
}
catch (...)
{
snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
}
res.set_content(buf, "text/plain; charset=utf-8");
res.status = 500;
});
svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
{
if (res.status == 401)
{
res.set_content("Unauthorized", "text/plain; charset=utf-8");
}
if (res.status == 400)
{
res.set_content("Invalid request", "text/plain; charset=utf-8");
}
else if (res.status == 404)
{
res.set_content("File Not Found", "text/plain; charset=utf-8");
res.status = 404;
}
});
// set timeouts and change hostname and port
svr.set_read_timeout (sparams.read_timeout);
svr.set_write_timeout(sparams.write_timeout);
if (!svr.bind_to_port(sparams.hostname, sparams.port))
{
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
return 1;
}
// Set the base directory for serving static files
svr.set_base_dir(sparams.public_path);
// to make it ctrl+clickable:
LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
std::unordered_map<std::string, std::string> log_data;
log_data["hostname"] = sparams.hostname;
log_data["port"] = std::to_string(sparams.port);
if (!sparams.api_key.empty()) {
log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4);
}
LOG_INFO("HTTP server listening", log_data);
// run the HTTP server in a thread - see comment below
std::thread t([&]()
{
if (!svr.listen_after_bind())
{
return 1;
}
return 0;
});
// GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
// "Bus error: 10" - this is on macOS, it does not crash on Linux
//std::thread t2([&]()

View file

@ -65,6 +65,10 @@ int main(int argc, char ** argv) {
// load the draft model
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
if (params.n_threads_draft > 0) {
params.n_threads = params.n_threads_draft;
}
params.n_threads_batch = params.n_threads_batch_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
{

18
flake.lock generated
View file

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1701473968,
"narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
"lastModified": 1704982712,
"narHash": "sha256-2Ptt+9h8dczgle2Oo6z5ni5rt/uLMG47UFTR1ry/wgg=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
"rev": "07f6395285469419cf9d078f59b5b49993198c00",
"type": "github"
},
"original": {
@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1703637592,
"narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
"lastModified": 1705133751,
"narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
"rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
"type": "github"
},
"original": {
@ -37,11 +37,11 @@
"nixpkgs-lib": {
"locked": {
"dir": "lib",
"lastModified": 1701253981,
"narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
"lastModified": 1703961334,
"narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
"rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9",
"type": "github"
},
"original": {

View file

@ -6,28 +6,41 @@
flake-parts.url = "github:hercules-ci/flake-parts";
};
# Optional binary cache
nixConfig = {
extra-substituters = [
# Populated by the CI in ggerganov/llama.cpp
"https://llama-cpp.cachix.org"
# A development cache for nixpkgs imported with `config.cudaSupport = true`.
# Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
# This lets one skip building e.g. the CUDA-enabled openmpi.
# TODO: Replace once nix-community obtains an official one.
"https://cuda-maintainers.cachix.org"
];
# Verify these are the same keys as published on
# - https://app.cachix.org/cache/llama-cpp
# - https://app.cachix.org/cache/cuda-maintainers
extra-trusted-public-keys = [
"llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
];
};
# There's an optional binary cache available. The details are below, but they're commented out.
#
# Why? The terrible experience of being prompted to accept them on every single Nix command run.
# Plus, there are warnings shown about not being a trusted user on a default Nix install
# if you *do* say yes to the prompts.
#
# This experience makes having `nixConfig` in a flake a persistent UX problem.
#
# To make use of the binary cache, please add the relevant settings to your `nix.conf`.
# It's located at `/etc/nix/nix.conf` on non-NixOS systems. On NixOS, adjust the `nix.settings`
# option in your NixOS configuration to add `extra-substituters` and `extra-trusted-public-keys`,
# as shown below.
#
# ```
# nixConfig = {
# extra-substituters = [
# # Populated by the CI in ggerganov/llama.cpp
# "https://llama-cpp.cachix.org"
#
# # A development cache for nixpkgs imported with `config.cudaSupport = true`.
# # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
# # This lets one skip building e.g. the CUDA-enabled openmpi.
# # TODO: Replace once nix-community obtains an official one.
# "https://cuda-maintainers.cachix.org"
# ];
#
# # Verify these are the same keys as published on
# # - https://app.cachix.org/cache/llama-cpp
# # - https://app.cachix.org/cache/cuda-maintainers
# extra-trusted-public-keys = [
# "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
# "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
# ];
# };
# ```
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
#

View file

@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
}
}
AT_PRINTF("block %d\n", best_fit_block);
if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
return;
}
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
void * addr = block->addr;
block->addr = (char*)block->addr + size;
@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
}
}
AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
tensor->data = addr;
tensor->buffer = alloc->buffer;
if (!alloc->measure) {
@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
} else {
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
ggml_backend_buffer_reset(alloc->buffer);
}
}
@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
return alloc;
}
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
// create a backend buffer to get the correct tensor allocation sizes
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
// TODO: move alloc initialization to a common ggml_tallocr_new_impl function
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
return alloc;
}
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
}
ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
// create a backend buffer to get the correct tensor allocation sizes
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
alloc->buffer_owned = true;
return alloc;
}
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
}
ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
if (nbytes == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
if (buffer == NULL) {
// failed to allocate buffer
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
#endif
return NULL;
}
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {

View file

@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;
GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);

View file

@ -16,13 +16,14 @@ extern "C" {
typedef void * ggml_backend_buffer_type_context_t;
struct ggml_backend_buffer_type_i {
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
bool (*is_host) (ggml_backend_buffer_type_t buft);
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
};
struct ggml_backend_buffer_type {
@ -34,16 +35,15 @@ extern "C" {
typedef void * ggml_backend_buffer_context_t;
struct ggml_backend_buffer_i {
void (*free_buffer) (ggml_backend_buffer_t buffer);
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
void * (*get_base) (ggml_backend_buffer_t buffer);
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};
struct ggml_backend_buffer {
@ -51,14 +51,17 @@ extern "C" {
ggml_backend_buffer_type_t buft;
ggml_backend_buffer_context_t context;
size_t size;
enum ggml_backend_buffer_usage usage;
};
ggml_backend_buffer_t ggml_backend_buffer_init(
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size);
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend
@ -67,33 +70,31 @@ extern "C" {
typedef void * ggml_backend_context_t;
struct ggml_backend_i {
const char * (*get_name)(ggml_backend_t backend);
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
void (*free)(ggml_backend_t backend);
void (*GGML_CALL free)(ggml_backend_t backend);
// buffer allocation
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchroneous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) asynchronous tensor data access
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) asynchroneous tensor copy
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*synchronize)(ggml_backend_t backend);
// (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan
bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
// compute graph without a plan (async)
bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
// check if the backend supports an operation
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
};
struct ggml_backend {
@ -102,14 +103,13 @@ extern "C" {
ggml_backend_context_t context;
};
//
// Backend registry
//
typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
#ifdef __cplusplus
}

File diff suppressed because it is too large Load diff

View file

@ -17,22 +17,31 @@ extern "C" {
//
// buffer type
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
};
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
//
// Backend
@ -49,8 +58,8 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@ -71,13 +80,13 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
// Create a backend buffer from an existing pointer
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
@ -140,23 +149,24 @@ extern "C" {
typedef struct ggml_backend_sched * ggml_backend_sched_t;
// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate a graph on the backend scheduler
GGML_API void ggml_backend_sched_graph_compute(
ggml_backend_sched_t sched,
struct ggml_cgraph * graph);
// Allocate and compute graph on the backend scheduler
GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
// Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
//
// Utils
@ -173,10 +183,10 @@ extern "C" {
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);

File diff suppressed because it is too large Load diff

View file

@ -18,46 +18,34 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API void ggml_init_cublas(void);
GGML_API GGML_CALL void ggml_init_cublas(void);
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API bool ggml_cublas_loaded(void);
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr);
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_set_main_device(int main_device);
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
GGML_API void ggml_cuda_free_scratch(void);
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API int ggml_cuda_get_device_count(void);
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
// backend API
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend);
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
#ifdef __cplusplus
}

View file

@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_HASHTABLE_FULL ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set ggml_hash_set_new(size_t size);
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted

View file

@ -27,7 +27,6 @@
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 64
#define GGML_METAL_MAX_COMMAND_BUFFERS 32
struct ggml_tensor;
struct ggml_cgraph;
@ -36,73 +35,22 @@ struct ggml_cgraph;
extern "C" {
#endif
//
// internal API
// temporary exposed to user-code
//
struct ggml_metal_context;
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
void * ggml_metal_host_malloc(size_t n);
void ggml_metal_host_free (void * data);
// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
// - max_size specifies the maximum size of a tensor and is used to create shared views such
// that it is guaranteed that the tensor will fit in at least one of the views
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size);
// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// try to find operations that can be run concurrently in the graph
// you should run it again if the topology of your graph changes
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
// output the concur_list for ggml_alloc
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
//
// backend API
// user-code should use only these functions
//
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
GGML_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks

File diff suppressed because it is too large Load diff

View file

@ -2446,6 +2446,19 @@ typedef struct {
} block_q6_K;
// 210 bytes / block
typedef struct {
half d;
uint16_t qs[QK_K/8];
} block_iq2_xxs;
// 66 bytes / block for QK_K = 256, so 2.0625 bpw
typedef struct {
half d;
uint16_t qs[QK_K/8];
uint8_t scales[QK_K/32];
} block_iq2_xs;
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
//====================================== dot products =========================
void kernel_mul_mv_q2_K_f32_impl(
@ -3468,6 +3481,495 @@ kernel void kernel_mul_mv_q6_K_f32(
kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
}
// ======================= "True" 2-bit
constexpr constant static uint64_t iq2xxs_grid[256] = {
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
};
constexpr constant static uint64_t iq2xs_grid[512] = {
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
};
constexpr constant static uint8_t ksigns_iq2xs[128] = {
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
};
constexpr constant static uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
void kernel_mul_mv_iq2_xxs_f32_impl(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne10,
constant int64_t & ne12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
threadgroup int8_t * shared_values [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK_K;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int ib_row = first_row * nb;
const uint i12 = im%ne12;
const uint i13 = im/ne12;
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
device const block_iq2_xxs * x = (device const block_iq2_xxs *) src0 + ib_row + offset0;
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[32];
float sumf[N_DST]={0.f}, all_sum;
const int nb32 = nb * (QK_K / 32);
threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 256);
{
int nval = 4;
int pos = (32*sgitg + tiisg)*nval;
for (int i = 0; i < nval; ++i) values[pos + i] = iq2xxs_grid[pos + i];
nval = 2;
pos = (32*sgitg + tiisg)*nval;
for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
threadgroup_barrier(mem_flags::mem_threadgroup);
}
#if QK_K == 256
const int ix = tiisg;
device const float * y4 = y + 32 * ix;
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
for (int i = 0; i < 32; ++i) {
yl[i] = y4[i];
}
const int ibl = ib32 / (QK_K / 32);
const int ib = ib32 % (QK_K / 32);
device const block_iq2_xxs * xr = x + ibl;
device const uint16_t * q2 = xr->qs + 4 * ib;
device const half * dh = &xr->d;
for (int row = 0; row < N_DST; row++) {
const float db = dh[0];
device const uint8_t * aux8 = (device const uint8_t *)q2;
const uint32_t aux32 = q2[2] | (q2[3] << 16);
const float d = db * (0.5f + (aux32 >> 28));
float sum = 0;
for (int l = 0; l < 4; ++l) {
const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + aux8[l]);
const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
for (int j = 0; j < 8; ++j) {
sum += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
}
}
sumf[row] += d * sum;
dh += nb*sizeof(block_iq2_xxs)/2;
q2 += nb*sizeof(block_iq2_xxs)/2;
}
y4 += 32 * 32;
}
#else
// TODO
#endif
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
}
}
}
[[host_name("kernel_mul_mv_iq2_xxs_f32")]]
kernel void kernel_mul_mv_iq2_xxs_f32(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
threadgroup int8_t * shared_values [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
}
void kernel_mul_mv_iq2_xs_f32_impl(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne10,
constant int64_t & ne12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
threadgroup int8_t * shared_values [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK_K;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int ib_row = first_row * nb;
const uint i12 = im%ne12;
const uint i13 = im/ne12;
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0;
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[32];
float sumf[N_DST]={0.f}, all_sum;
const int nb32 = nb * (QK_K / 32);
threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 512);
{
int nval = 8;
int pos = (32*sgitg + tiisg)*nval;
for (int i = 0; i < nval; ++i) values[pos + i] = iq2xs_grid[pos + i];
nval = 2;
pos = (32*sgitg + tiisg)*nval;
for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
threadgroup_barrier(mem_flags::mem_threadgroup);
}
#if QK_K == 256
const int ix = tiisg;
device const float * y4 = y + 32 * ix;
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
for (int i = 0; i < 32; ++i) {
yl[i] = y4[i];
}
const int ibl = ib32 / (QK_K / 32);
const int ib = ib32 % (QK_K / 32);
device const block_iq2_xs * xr = x + ibl;
device const uint16_t * q2 = xr->qs + 4 * ib;
device const uint8_t * sc = xr->scales + ib;
device const half * dh = &xr->d;
for (int row = 0; row < N_DST; row++) {
const float db = dh[0];
const uint8_t ls1 = sc[0] & 0xf;
const uint8_t ls2 = sc[0] >> 4;
const float d1 = db * (0.5f + ls1);
const float d2 = db * (0.5f + ls2);
float sum1 = 0, sum2 = 0;
for (int l = 0; l < 2; ++l) {
const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
const uint8_t signs = shared_signs[(q2[l] >> 9)];
for (int j = 0; j < 8; ++j) {
sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
}
}
for (int l = 2; l < 4; ++l) {
const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
const uint8_t signs = shared_signs[(q2[l] >> 9)];
for (int j = 0; j < 8; ++j) {
sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
}
}
sumf[row] += d1 * sum1 + d2 * sum2;
dh += nb*sizeof(block_iq2_xs)/2;
q2 += nb*sizeof(block_iq2_xs)/2;
sc += nb*sizeof(block_iq2_xs);
}
y4 += 32 * 32;
}
#else
// TODO
#endif
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
}
}
}
[[host_name("kernel_mul_mv_iq2_xs_f32")]]
kernel void kernel_mul_mv_iq2_xs_f32(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
threadgroup int8_t * shared_values [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
}
//============================= templates and their specializations =============================
// NOTE: this is not dequantizing - we are simply fitting the template
@ -3620,8 +4122,8 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
: (scale_2&kmask2) | ((scale_1&kmask1) << 4);
half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h);
const half ml = 4.h * dl;
float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
const float ml = 4.f * dl;
il = (il/2) & 3;
const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
@ -3688,7 +4190,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
uint8_t ul = 1 << (il/2);
il = il & 3;
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
const float d = il < 2 ? xb->d : xb->d / 16.h;
const float d = il < 2 ? xb->d : xb->d / 16.f;
const float min = xb->dmin;
const float dl = d * sc[0];
const float ml = min * sc[1];
@ -3721,17 +4223,17 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
#if QK_K == 256
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
qh = qh + 32*(il/8) + 16*(il&1);
half sc = scales[(il%2) + 2 * ((il/2))];
float sc = scales[(il%2) + 2 * ((il/2))];
il = (il/2) & 3;
#else
ql = ql + 16 * (il&1);
half sc = scales[il];
float sc = scales[il];
#endif
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
const half coef = il>1 ? 1.f/16.h : 1.h;
const half ml = d_all * sc * 32.h;
const half dl = d_all * sc * coef;
const float coef = il>1 ? 1.f/16.f : 1.f;
const float ml = d_all * sc * 32.f;
const float dl = d_all * sc * coef;
for (int i = 0; i < 16; ++i) {
const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
: ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
@ -3739,6 +4241,52 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
}
}
template <typename type4x4>
void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
const float d = xb->d;
const int ib32 = il/2;
il = il%2;
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
// each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
device const uint16_t * q2 = xb->qs + 4*ib32;
const uint32_t aux32_g = q2[0] | (q2[1] << 16);
const uint32_t aux32_s = q2[2] | (q2[3] << 16);
thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
for (int i = 0; i < 8; ++i) {
reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
}
grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
for (int i = 0; i < 8; ++i) {
reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
}
}
template <typename type4x4>
void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
const float d = xb->d;
const int ib32 = il/2;
il = il%2;
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
device const uint16_t * q2 = xb->qs + 4*ib32;
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
for (int i = 0; i < 8; ++i) {
reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
}
grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
signs = ksigns_iq2xs[q2[2*il+1] >> 9];
for (int i = 0; i < 8; ++i) {
reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
}
}
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
kernel void kernel_get_rows(
device const void * src0,
@ -4278,6 +4826,8 @@ template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows
template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows<block_q4_K, QK_NL, dequantize_q4_K>;
template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
//
// matrix-matrix multiplication
@ -4314,6 +4864,8 @@ template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_K, QK_NL, dequantize_q5_K>;
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>;
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
//
// indirect matrix-matrix multiplication
@ -4362,6 +4914,8 @@ template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mu
template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_K, QK_NL, dequantize_q4_K>;
template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_K, QK_NL, dequantize_q5_K>;
template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K, QK_NL, dequantize_q6_K>;
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
//
// matrix-vector multiplication
@ -5134,3 +5688,133 @@ kernel void kernel_mul_mv_id_q6_K_f32(
tiisg,
sgitg);
}
[[host_name("kernel_mul_mv_id_iq2_xxs_f32")]]
kernel void kernel_mul_mv_id_iq2_xxs_f32(
device const char * ids,
device const char * src1,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
device const char * src00,
device const char * src01,
device const char * src02,
device const char * src03,
device const char * src04,
device const char * src05,
device const char * src06,
device const char * src07,
threadgroup int8_t * shared_values [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiitg[[thread_index_in_threadgroup]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
const int64_t bid = tgpig.z/(ne12*ne13);
tgpig.z = tgpig.z%(ne12*ne13);
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
kernel_mul_mv_iq2_xxs_f32_impl(
src0[id],
(device const float *) (src1 + bid*nb11),
dst + bid*ne0,
ne00,
ne01,
ne02,
ne10,
ne12,
ne0,
ne1,
r2,
r3,
shared_values,
tgpig,
tiisg,
sgitg);
}
[[host_name("kernel_mul_mv_id_iq2_xs_f32")]]
kernel void kernel_mul_mv_id_iq2_xs_f32(
device const char * ids,
device const char * src1,
device float * dst,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
device const char * src00,
device const char * src01,
device const char * src02,
device const char * src03,
device const char * src04,
device const char * src05,
device const char * src06,
device const char * src07,
threadgroup int8_t * shared_values [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiitg[[thread_index_in_threadgroup]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
const int64_t bid = tgpig.z/(ne12*ne13);
tgpig.z = tgpig.z%(ne12*ne13);
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
kernel_mul_mv_iq2_xs_f32_impl(
src0[id],
(device const float *) (src1 + bid*nb11),
dst + bid*ne0,
ne00,
ne01,
ne02,
ne10,
ne12,
ne0,
ne1,
r2,
r3,
shared_values,
tgpig,
tiisg,
sgitg);
}

View file

@ -1,5 +1,6 @@
#include "ggml.h"
#include "ggml-opencl.h"
#include "ggml-backend-impl.h"
#include <array>
#include <atomic>
@ -10,7 +11,7 @@
#include <sstream>
#include <vector>
#define CL_TARGET_OPENCL_VERSION 110
#define CL_TARGET_OPENCL_VERSION 120
#include <clblast.h>
#if defined(_MSC_VER)
@ -929,6 +930,12 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
}
void ggml_cl_init(void) {
static bool initialized = false;
if (initialized) {
return;
}
initialized = true;
cl_int err;
struct cl_device;
@ -1483,8 +1490,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
} else {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
size_t x_offset = 0;
@ -1501,7 +1508,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
// copy src1 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
if (src1->backend == GGML_BACKEND_CPU) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
}
CL_CHECK(clFinish(queue));
@ -1522,8 +1531,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
}
// copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
if (dst->backend == GGML_BACKEND_CPU) {
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
}
}
}
}
@ -1532,8 +1543,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_X, x_size);
}
ggml_cl_pool_free(d_Y, y_size);
ggml_cl_pool_free(d_D, d_size);
if (src1->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_Y, y_size);
}
if (dst->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_D, d_size);
}
}
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
@ -1598,6 +1613,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
}
// FIXME: convert on device
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
// convert src1 to fp16
// TODO: use multiple threads
@ -1643,11 +1660,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
}
// copy dst to host, then convert to float
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_fp16_to_fp32_row(tmp, d, d_ne);
if (dst->backend == GGML_BACKEND_CPU) {
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_fp16_to_fp32_row(tmp, d, d_ne);
} else {
// FIXME: convert dst to fp32 on device
}
}
}
}
@ -1801,7 +1820,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
}
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
@ -1895,3 +1914,291 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
tensor->extra = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}
// ggml-backend
// buffer
struct ggml_backend_opencl_buffer_context {
~ggml_backend_opencl_buffer_context() {
if (buffer) {
clReleaseMemObject(buffer);
}
for (auto * sub_buffer : sub_buffers) {
clReleaseMemObject(sub_buffer);
}
}
cl_mem buffer;
std::vector<cl_mem> sub_buffers;
};
static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
return "OpenCL";
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
}
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
return cl_ptr_base;
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
if (tensor->view_src != NULL && tensor->view_offs == 0) {
tensor->extra = tensor->view_src->extra;
} else {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
cl_int err;
cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
ctx->sub_buffers.push_back(sub_buffer);
tensor->extra = sub_buffer;
}
tensor->backend = GGML_BACKEND_GPU;
}
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
cl_mem tensor_buffer = (cl_mem) tensor->extra;
CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
cl_mem tensor_buffer = (cl_mem) tensor->extra;
CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
}
static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
for (auto * sub_buffer : ctx->sub_buffers) {
clReleaseMemObject(sub_buffer);
}
ctx->sub_buffers.clear();
}
static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_get_name,
/* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
/* .get_base = */ ggml_backend_opencl_buffer_get_base,
/* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
/* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_opencl_buffer_clear,
/* .reset = */ ggml_backend_opencl_buffer_reset,
};
// buffer type
static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
return "OpenCL";
GGML_UNUSED(buffer_type);
}
static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
ggml_cl_init();
cl_int err;
cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
if (err != CL_SUCCESS) {
fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
return nullptr;
}
ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
}
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
// FIXME: not thread safe, device may not be initialized yet
static cl_uint alignment = -1;
if (alignment == (cl_uint)-1) {
ggml_cl_init();
clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
}
return alignment;
GGML_UNUSED(buffer_type);
}
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
//return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
return ggml_backend_is_cpu(backend);
GGML_UNUSED(buffer_type);
}
static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL,
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
/* .is_host = */ NULL,
};
ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
static ggml_backend_buffer_type buffer_type = {
/* .iface = */ ggml_backend_opencl_buffer_type_interface,
/* .context = */ nullptr,
};
return &buffer_type;
}
#if 0
// host buffer type
static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
return "CL_Host";
GGML_UNUSED(buft);
}
static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
return "CL_Host";
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_cl_host_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * ptr = ggml_cl_host_malloc(size);
if (ptr == nullptr) {
// fallback to cpu buffer
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
/* .iface = */ {
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .context = */ nullptr,
};
return &ggml_backend_opencl_buffer_type_host;
}
// backend
static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
return "OpenCL";
GGML_UNUSED(backend);
}
static void ggml_backend_opencl_free(ggml_backend_t backend) {
GGML_UNUSED(backend);
}
static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_opencl_buffer_type();
GGML_UNUSED(backend);
}
static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * node = graph->nodes[i];
switch (node->op) {
case GGML_OP_MUL_MAT:
ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
break;
case GGML_OP_MUL:
ggml_cl_mul(node->src[0], node->src[1], node);
break;
default:
GGML_ASSERT(false);
}
}
return true;
GGML_UNUSED(backend);
}
static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_MUL_MAT:
return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
case GGML_OP_MUL:
// return ggml_can_repeat_rows(op->src[1], op->src[0]);
return true;
default:
return false;
}
GGML_UNUSED(backend);
}
static ggml_backend_i opencl_backend_i = {
/* .get_name = */ ggml_backend_opencl_name,
/* .free = */ ggml_backend_opencl_free,
/* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_from_async = */ NULL,
/* .cpy_tensor_to_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_opencl_graph_compute,
/* .supports_op = */ ggml_backend_opencl_supports_op,
};
ggml_backend_t ggml_backend_opencl_init() {
ggml_backend_t backend = new ggml_backend {
/* .interface = */ opencl_backend_i,
/* .context = */ nullptr
};
return backend;
}
bool ggml_backend_is_opencl(ggml_backend_t backend) {
return backend && backend->iface.get_name == ggml_backend_opencl_name;
}
#endif

View file

@ -1,6 +1,7 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
@ -9,17 +10,26 @@ extern "C" {
GGML_API void ggml_cl_init(void);
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
GGML_API void * ggml_cl_host_malloc(size_t size);
GGML_API void ggml_cl_host_free(void * ptr);
// GGML_API void * ggml_cl_host_malloc(size_t size);
// GGML_API void ggml_cl_host_free(void * ptr);
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
// backend API
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large Load diff

View file

@ -165,6 +165,22 @@ typedef struct {
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
// (Almost) "true" 2-bit quantization.
// Due to the need to use blocks as per ggml dsign, it ends up using
// 2.0625 bpw because of the 16-bit scale for each block of 256.
typedef struct {
ggml_fp16_t d;
uint16_t qs[QK_K/8];
} block_iq2_xxs;
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
// 2.3125 bpw quants
typedef struct {
ggml_fp16_t d;
uint16_t qs[QK_K/8];
uint8_t scales[QK_K/32];
} block_iq2_xs;
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
// Quantization
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
@ -209,6 +225,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@ -222,3 +240,20 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx,
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
//
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
//
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);

View file

@ -48,7 +48,7 @@
#define VK_NUM_TYPES 16
#define VK_MAX_NODES 8192
#define GGML_VK_MAX_NODES 8192
#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 1
@ -193,7 +193,10 @@ struct ggml_tensor_extra_gpu {
bool buffer_static;
vk_buffer * buffer_gpu;
uint64_t offset;
uint64_t base_buffer_offset;
uint64_t view_offset;
bool prepared;
};
struct ggml_vk_garbage_collector {
@ -455,7 +458,7 @@ static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk_semapho
static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk::Fence fence) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_submit(" << q.queue_family_index << ", (" << q.queue << "), " << sequences.size() << ")" << std::endl;
std::cerr << "ggml_vk_submit(" << q.queue_family_index << " (" << q.queue << "), " << sequences.size() << ", " << fence << ")" << std::endl;
#endif
if (sequences.empty()) {
return;
@ -941,10 +944,18 @@ static void ggml_vk_load_shaders() {
vk_pipeline_rope_f16 = ggml_vk_create_pipeline("rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
}
void ggml_vk_init(void) {
void ggml_vk_init() {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_init()" << std::endl;
#endif
static bool initialized = false;
if (initialized) {
return;
}
initialized = true;
const char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE");
int dev_num = (GGML_VULKAN_DEVICE == NULL ? 0 : atoi(GGML_VULKAN_DEVICE));
@ -1659,12 +1670,12 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
memcpy(dst, (uint8_t *) src->ptr + offset, size);
} else {
vk_context ctx;
ggml_vk_ctx_begin(ctx, q);
vk_context * ctx = ggml_vk_create_context();
ggml_vk_ctx_begin(*ctx, q);
std::vector<vk_staging_memcpy> staging;
ggml_vk_buffer_read_async(ctx, src, offset, dst, size, q, &staging);
ggml_vk_ctx_end(ctx);
ggml_vk_submit(q, ctx.seqs, vk_fence);
ggml_vk_buffer_read_async(*ctx, src, offset, dst, size, q, &staging);
ggml_vk_ctx_end(*ctx);
ggml_vk_submit(q, ctx->seqs, vk_fence);
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, uint64_t(-1)), "vk_buffer_read waitForFences");
vk_device.device.resetFences({ vk_fence });
@ -1674,6 +1685,21 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
}
}
static void ggml_vk_buffer_copy(vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size, vk_queue& q) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_buffer_copy(" << size << ")" << std::endl;
#endif
VkBufferCopy bc{ src_offset, dst_offset, size };
vk_context * ctx = ggml_vk_create_context();
ggml_vk_ctx_begin(*ctx, q);
vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc);
ggml_vk_ctx_end(*ctx);
ggml_vk_submit(q, ctx->seqs, vk_fence);
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, uint64_t(-1)), "vk_buffer_copy waitForFences");
vk_device.device.resetFences({ vk_fence });
}
static void ggml_vk_buffer_memset(vk_buffer* dst, size_t offset, uint32_t c, size_t size, vk_queue& q) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
@ -1984,7 +2010,7 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
vk_buffer* d_D = extra->buffer_gpu;
const uint64_t d_buf_offset = extra->offset;
const uint64_t d_buf_offset = extra->base_buffer_offset + extra->view_offset;
GGML_ASSERT(d_D != nullptr);
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
vk_buffer* d_Qx;
@ -2000,7 +2026,7 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
GGML_ASSERT(d_Qx->size >= qx_sz * ne02 * ne03);
} else {
d_Qx = extra_src0->buffer_gpu;
qx_buf_offset = extra_src0->offset;
qx_buf_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
GGML_ASSERT(d_Qx != nullptr);
}
if (load_y) {
@ -2008,7 +2034,7 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
GGML_ASSERT(d_Qy->size >= qy_sz * ne02 * ne03);
} else {
d_Qy = extra_src1->buffer_gpu;
qy_buf_offset = extra_src1->offset;
qy_buf_offset = extra_src1->base_buffer_offset + extra_src1->view_offset;
GGML_ASSERT(d_Qy != nullptr);
}
if (qx_needs_dequant) {
@ -2153,7 +2179,7 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
vk_buffer* d_D = extra->buffer_gpu;
const uint64_t d_buf_offset = extra->offset;
const uint64_t d_buf_offset = extra->base_buffer_offset + extra->view_offset;
GGML_ASSERT(d_D != nullptr);
vk_buffer* d_Qx;
uint32_t qx_buf_offset = 0;
@ -2167,14 +2193,14 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
d_Qx = &vk_prealloc_qx;
} else {
d_Qx = extra_src0->buffer_gpu;
qx_buf_offset = extra_src0->offset;
qx_buf_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
GGML_ASSERT(d_Qx != nullptr);
}
if (load_y) {
d_Qy = &vk_prealloc_qy;
} else {
d_Qy = extra_src1->buffer_gpu;
qy_buf_offset = extra_src1->offset;
qy_buf_offset = extra_src1->base_buffer_offset + extra_src1->view_offset;
GGML_ASSERT(d_Qy != nullptr);
}
if (qx_needs_dequant) {
@ -2316,10 +2342,10 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context& ctx, const ggml_tensor
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
vk_buffer* d_D = extra->buffer_gpu;
const uint64_t d_buf_offset = extra->offset;
const uint64_t d_buf_offset = extra->base_buffer_offset + extra->view_offset;
GGML_ASSERT(d_D != nullptr);
vk_buffer* d_Qx;
const uint32_t qx_buf_offset = extra_src0->offset;
const uint32_t qx_buf_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
vk_buffer* d_Qy;
uint32_t qy_buf_offset = 0;
d_Qx = extra_src0->buffer_gpu;
@ -2328,7 +2354,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context& ctx, const ggml_tensor
d_Qy = &vk_prealloc_qy;
} else {
d_Qy = extra_src1->buffer_gpu;
qy_buf_offset = extra_src1->offset;
qy_buf_offset = extra_src1->base_buffer_offset + extra_src1->view_offset;
GGML_ASSERT(d_Qx != nullptr);
}
@ -2408,10 +2434,10 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
vk_buffer* d_D = extra->buffer_gpu;
const uint64_t d_buf_offset = extra->offset;
const uint64_t d_buf_offset = extra->base_buffer_offset + extra->view_offset;
GGML_ASSERT(d_D != nullptr);
vk_buffer* d_Qx;
const uint32_t qx_buf_offset = extra_src0->offset;
const uint32_t qx_buf_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
vk_buffer* d_Qy;
uint32_t qy_buf_offset = 0;
d_Qx = extra_src0->buffer_gpu;
@ -2420,7 +2446,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
d_Qy = &vk_prealloc_qy;
} else {
d_Qy = extra_src1->buffer_gpu;
qy_buf_offset = extra_src1->offset;
qy_buf_offset = extra_src1->base_buffer_offset + extra_src1->view_offset;
GGML_ASSERT(d_Qx != nullptr);
}
@ -2516,9 +2542,9 @@ static void ggml_vk_op_repeat(vk_context& ctx, const ggml_tensor * src0, const g
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
const vk_buffer* src_buf = extra_src0->buffer_gpu;
const uint64_t src_offset = extra_src0->offset;
const uint64_t src_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
vk_buffer* dst_buf = extra->buffer_gpu;
const uint64_t dst_offset = extra->offset;
const uint64_t dst_offset = extra->base_buffer_offset + extra->view_offset;
std::vector<vk::BufferCopy> copies;
@ -2719,8 +2745,8 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
vk_buffer* d_D = extra->buffer_gpu;
GGML_ASSERT(d_D != nullptr);
uint64_t d_buf_offset = (extra->offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
uint64_t d_buf_offset = ((extra->base_buffer_offset + extra->view_offset) / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
GGML_ASSERT(d_buf_offset == (extra->base_buffer_offset + extra->view_offset) || op == GGML_OP_CPY); // NOLINT
vk_buffer* d_X = nullptr;
uint64_t x_buf_offset = 0;
vk_buffer* d_Y = nullptr;
@ -2729,14 +2755,14 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
d_X = &vk_prealloc_qx;
} else {
d_X = extra_src0->buffer_gpu;
x_buf_offset = extra_src0->offset;
x_buf_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
GGML_ASSERT(d_X != nullptr);
}
if (transfer_src1) {
d_Y = &vk_prealloc_qy;
} else if (use_src1) {
d_Y = extra_src1->buffer_gpu;
y_buf_offset = extra_src1->offset;
y_buf_offset = extra_src1->base_buffer_offset + extra_src1->view_offset;
GGML_ASSERT(d_Y != nullptr);
}
@ -2745,7 +2771,7 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
GGML_ASSERT(!transfer_src1);
d_sz = dst->ne[1] * dst->nb[1];
if (extra->offset + d_sz >= d_D->size) {
if (extra->base_buffer_offset + extra->view_offset + d_sz >= d_D->size) {
d_sz = VK_WHOLE_SIZE;
}
}
@ -2886,7 +2912,7 @@ static void ggml_vk_cpy(vk_context& ctx, const ggml_tensor * src0, ggml_tensor *
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
const int src0_type_size = ggml_type_size(src0->type);
const int dst_type_size = ggml_type_size(dst->type);
const uint32_t d_offset = (extra->offset % vk_device.properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
const uint32_t d_offset = ((extra->base_buffer_offset + extra->view_offset) % vk_device.properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
ggml_vk_op_f32<vk_op_cpy_push_constants>(ctx, src0, nullptr, dst, GGML_OP_CPY, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size,
@ -3445,18 +3471,19 @@ void ggml_vk_test_transfer(size_t ne, bool pinned) {
}
#endif
static void ggml_vk_transform_tensor(void * data, ggml_tensor * tensor, bool buffer_static) {
static void ggml_vk_transform_tensor(const void * data, ggml_tensor * tensor, bool buffer_static) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_transform_tensor(" << data << ", " << tensor << " (" << tensor->name << "))" << std::endl;
#endif
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
GGML_ASSERT(ggml_is_contiguous(tensor));
ggml_tensor_extra_gpu * extra;
GGML_ASSERT(tensor->extra == nullptr);
extra = new ggml_tensor_extra_gpu;
memset((void *) extra, 0, sizeof(ggml_tensor_extra_gpu));
tensor->extra = extra;
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
if (extra == nullptr) {
extra = new ggml_tensor_extra_gpu;
memset((void *) extra, 0, sizeof(ggml_tensor_extra_gpu));
tensor->extra = extra;
}
const size_t size = ggml_nbytes(tensor);
@ -3471,10 +3498,10 @@ static void ggml_vk_transform_tensor(void * data, ggml_tensor * tensor, bool buf
}
}
void ggml_vk_transform_tensor_temporary(void * data, ggml_tensor * tensor) {
void ggml_vk_transform_tensor_temporary(const void * data, ggml_tensor * tensor) {
ggml_vk_transform_tensor(data, tensor, false);
}
void ggml_vk_transform_tensor_static(void * data, ggml_tensor * tensor) {
void ggml_vk_transform_tensor_static(const void * data, ggml_tensor * tensor) {
ggml_vk_transform_tensor(data, tensor, true);
}
@ -3502,6 +3529,7 @@ static void ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
#endif
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
memset((void *) extra, 0, sizeof(ggml_tensor_extra_gpu));
extra->d_idx = -1;
tensor->extra = extra;
}
@ -3620,9 +3648,6 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node, ggml_cgraph * graph){
#ifdef VK_DEBUG
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
#endif
const bool src0_gpu = false; // node->src[0] != nullptr && node->src[0]->ne[1] > 32 && node->src[0]->extra != nullptr && node->src[0]->backend == GGML_BACKEND_CPU;
const bool src1_gpu = false; // node->src[1] != nullptr && node->src[1]->ne[1] > 32 && node->src[1]->extra != nullptr && node->src[1]->backend == GGML_BACKEND_CPU;
const bool any_on_device = node->backend == GGML_BACKEND_GPU
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|| (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
@ -3635,6 +3660,8 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node, ggml_cgraph * graph){
ggml_vk_tensor_create_extra(node);
}
((ggml_tensor_extra_gpu *) node->extra)->prepared = true;
ggml_tensor * src0 = node->src[0];
ggml_tensor * src1 = node->src[1];
@ -3687,48 +3714,6 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node, ggml_cgraph * graph){
d_sz = ggml_vk_align_size(node->ne[1] * node->nb[1], vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
}
// Block buffers for reuse early
switch (node->op) {
case GGML_OP_REPEAT:
case GGML_OP_GET_ROWS:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_ADD:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_MUL:
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_MUL_MAT:
if (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node)) {
return;
}
// Reuse GPU buffer if previous op is also on GPU
if (src0_gpu) {
// std::cerr << "Offloading " << src0 << " (" << ggml_op_name(src0->op) << ") to GPU because of " << src0->extra << std::endl;
src0->backend = GGML_BACKEND_GPU;
// Replace with data GPU tensor
vk_prealloc_d_blocked[((ggml_tensor_extra_gpu *)src0->extra)->d_idx] = ggml_vk_find_last_use(src0, graph);
}
if (src1_gpu) {
// std::cerr << "Offloading " << src1 << " (" << ggml_op_name(src1->op) << ") to GPU because of " << src1->extra << std::endl;
src1->backend = GGML_BACKEND_GPU;
// Replace with data GPU tensor
vk_prealloc_d_blocked[((ggml_tensor_extra_gpu *)src1->extra)->d_idx] = ggml_vk_find_last_use(src1, graph);
}
default:
break;
}
// -1 means pick from preallocated buffers
// -2 means don't pick, it has already been assigned
int inplace = -1;
@ -3749,7 +3734,8 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node, ggml_cgraph * graph){
inplace = extra->d_idx;
}
extra->tensor_size = extra_src->tensor_size;
extra->offset = node->view_offs;
extra->base_buffer_offset = extra_src->base_buffer_offset;
extra->view_offset = node->view_offs;
}
switch (node->op) {
@ -3801,20 +3787,11 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node, ggml_cgraph * graph){
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
if (node->backend == GGML_BACKEND_GPU && extra->d_idx >= 0) {
if (node->backend == GGML_BACKEND_GPU && extra->d_idx >= 0 && graph != nullptr) {
// Replace with data GPU tensor
vk_prealloc_d_blocked[extra->d_idx] = ggml_vk_find_last_use(node, graph);
}
// std::cerr << "Created extra " << node->extra << " for " << node << " (" << ggml_op_name(node->op) << ") with";
// if (src0 != nullptr) {
// std::cerr << " src0=" << src0 << " (" << ggml_op_name(src0->op) << ")";
// }
// if (src1 != nullptr) {
// std::cerr << " src1=" << src1 << " (" << ggml_op_name(src1->op) << ")";
// }
// std::cerr << std::endl;
// Unblock buffers if they terminate at current node
for (size_t i = 0; i < vk_prealloc_d_blocked.size(); i++) {
if (vk_prealloc_d_blocked[i] == node) {
@ -3824,6 +3801,15 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node, ggml_cgraph * graph){
}
void ggml_vk_preallocate_buffers() {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_preallocate_buffers()" << std::endl;
std::cerr << "d_sizes: ";
for (size_t i = 0; i < vk_prealloc_d_sizes.size(); i++) {
std::cerr << vk_prealloc_d_sizes[i] << " ";
}
std::cerr << std::endl;
std::cerr << "qx_size: " << vk_prealloc_size_qx << " qy_size: " << vk_prealloc_size_qy << " x_size: " << vk_prealloc_size_x << " y_size: " << vk_prealloc_size_y << std::endl;
#endif
#if defined(VK_RUN_TESTS)
ggml_vk_test_transfer(8192, false);
ggml_vk_test_transfer(8192, true);
@ -3918,7 +3904,7 @@ void ggml_vk_preallocate_buffers() {
}
}
void ggml_vk_build_graph(ggml_tensor * node){
void ggml_vk_build_graph(ggml_tensor * node, bool last_node){
const bool any_on_device = node->backend == GGML_BACKEND_GPU
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|| (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
@ -3928,7 +3914,7 @@ void ggml_vk_build_graph(ggml_tensor * node){
}
#ifdef VK_DEBUG
std::cerr << "ggml_vk_build_graph(" << node << ")" << std::endl;
std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
#endif
vk_semaphore_idx = 0;
vk_staging_offset = 0;
@ -3944,6 +3930,47 @@ void ggml_vk_build_graph(ggml_tensor * node){
// This can't be done earlier cause the buffer may not exist yet
if (extra->d_idx >= 0) {
extra->buffer_gpu = &vk_prealloc_d_buffers[extra->d_idx];
extra->base_buffer_offset = 0;
}
switch (node->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_RELU:
break;
default:
return;
}
break;
case GGML_OP_REPEAT:
case GGML_OP_GET_ROWS:
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_MUL_MAT:
break;
default:
if (any_on_device) {
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ASSERT(false);
}
return;
}
if (vk_ctx == nullptr) {
@ -4029,17 +4056,17 @@ void ggml_vk_build_graph(ggml_tensor * node){
break;
default:
if (any_on_device) {
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ASSERT(false);
}
return;
}
extra->ready = true;
extra->ctx_idx = vk_ctx->idx;
if (node->backend == GGML_BACKEND_CPU) {
#ifdef GGML_VULKAN_CHECK_RESULTS
last_node = true;
#endif
if (node->backend == GGML_BACKEND_CPU || last_node) {
ggml_vk_ctx_end(*vk_ctx);
vk_ctx->exit_tensor = node;
vk_ctx = nullptr;
@ -4223,13 +4250,224 @@ void ggml_vk_cleanup() {
#define UNUSED GGML_UNUSED
struct ggml_backend_vk_context {
std::string name;
};
// device backend
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;
struct ggml_backend_vk_buffer_context {
// vk_buffer dev_buffer;
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
size_t temp_tensor_extra_index = 0;
std::string name;
ggml_backend_vk_buffer_context() :
name(GGML_VK_NAME) {
}
~ggml_backend_vk_buffer_context() {
delete[] temp_tensor_extras;
}
ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
if (temp_tensor_extras == nullptr) {
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES];
}
size_t alloc_index = temp_tensor_extra_index;
temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES;
ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
memset(extra, 0, sizeof(*extra));
return extra;
}
};
GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
return ctx->name.c_str();
}
GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
}
GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
// ggml_vk_pool_free(ctx->dev_buffer);
delete ctx;
}
GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
return vk_ptr_base;
UNUSED(buffer);
}
GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
#ifdef VK_DEBUG
std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << ", " << tensor << ")" << std::endl;
#endif
// ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
// if (tensor->view_src != NULL && tensor->view_offs == 0) {
// assert(tensor->view_src->buffer->buft == buffer->buft);
// tensor->backend = tensor->view_src->backend;
// tensor->extra = tensor->view_src->extra;
// return;
// }
// ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
// extra->buffer_gpu = &ctx->dev_buffer;
// extra->base_buffer_offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
tensor->backend = GGML_BACKEND_GPU;
// tensor->extra = extra;
if (tensor->op == GGML_OP_NONE) {
ggml_vk_assign_buffer(tensor);
} else {
ggml_vk_preallocate_buffers_graph(tensor, nullptr);
}
}
GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
#ifdef VK_DEBUG
std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
#endif
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
// ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
// ggml_vk_buffer_write(&ctx->dev_buffer, offset, data, size, vk_device.transfer_queue);
ggml_vk_transform_tensor_static(data, tensor);
}
GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
#ifdef VK_DEBUG
std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
#endif
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
ggml_vk_buffer_read(extra->buffer_gpu, offset, data, size, vk_device.transfer_queue);
}
GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
// if (ggml_backend_buffer_is_vk(src->buffer)) {
// ggml_backend_vk_buffer_context * src_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
// ggml_backend_vk_buffer_context * dst_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
// ggml_vk_buffer_copy(&dst_ctx->dev_buffer, 0, &src_ctx->dev_buffer, 0, ggml_nbytes(src), vk_device.transfer_queue);
// return true;
// }
return false;
}
GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
// ggml_vk_buffer_memset(&ctx->dev_buffer, 0, value, buffer->size, vk_device.transfer_queue);
}
static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
/* .get_name = */ ggml_backend_vk_buffer_get_name,
/* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
/* .get_base = */ ggml_backend_vk_buffer_get_base,
/* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
/* .set_tensor = */ ggml_backend_vk_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_vk_buffer_get_tensor,
/* .cpy_tensor = */ NULL, // ggml_backend_vk_buffer_cpy_tensor,
/* .clear = */ ggml_backend_vk_buffer_clear,
/* .reset = */ NULL,
};
// vk buffer type
struct ggml_backend_vk_buffer_type_context {
std::string name;
};
GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
return ctx->name.c_str();
}
GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
// vk_buffer dev_buffer = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
ggml_backend_vk_buffer_context * ctx = new ggml_backend_vk_buffer_context();
return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, ctx, size);
UNUSED(buft);
}
GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return vk_device.properties.limits.minStorageBufferOffsetAlignment;
UNUSED(buft);
}
GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
return ggml_nbytes(tensor);
UNUSED(buft);
}
GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_vk(backend);
}
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
/* .get_name = */ ggml_backend_vk_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
/* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
/* .is_host = */ NULL,
};
GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type() {
static ggml_backend_buffer_type ggml_backend_vk_buffer_type;
static bool ggml_backend_vk_buffer_type_initialized = false;
if (!ggml_backend_vk_buffer_type_initialized) {
ggml_backend_vk_buffer_type = {
/* .iface = */ ggml_backend_vk_buffer_type_interface,
/* .context = */ new ggml_backend_vk_buffer_type_context{GGML_VK_NAME},
};
ggml_backend_vk_buffer_type_initialized = true;
}
return &ggml_backend_vk_buffer_type;
}
// host buffer type
static void ggml_backend_vulkan_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_CALL static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
return GGML_VK_NAME "_Host";
UNUSED(buft);
}
GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
return GGML_VK_NAME "_Host";
UNUSED(buffer);
}
GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_vk_host_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_vulkan_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * ptr = ggml_vk_host_malloc(size);
if (ptr == nullptr) {
@ -4237,19 +4475,26 @@ static ggml_backend_buffer_t ggml_backend_vulkan_host_buffer_type_alloc_buffer(g
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
}
// FIXME: this is a hack to avoid having to implement a new buffer type
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_vulkan_host_buffer_free_buffer;
buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_vulkan_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_vulkan_buffer_type_host = {
GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return vk_device.properties.limits.minMemoryMapAlignment;
UNUSED(buft);
}
GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
/* .iface = */ {
/* .alloc_buffer = */ ggml_backend_vulkan_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_name = */ ggml_backend_vk_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
@ -4257,7 +4502,213 @@ ggml_backend_buffer_type_t ggml_backend_vulkan_host_buffer_type() {
/* .context = */ nullptr,
};
return &ggml_backend_vulkan_buffer_type_host;
return &ggml_backend_vk_buffer_type_host;
}
// backend
GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context;
return vk_ctx->name.c_str();
}
GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context;
delete vk_ctx;
delete backend;
}
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_vk_buffer_type();
UNUSED(backend);
}
GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context;
for (int i = 0; i < cgraph->n_leafs; i++) {
ggml_tensor * node = cgraph->leafs[i];
if (node->backend == GGML_BACKEND_GPU && node->extra == nullptr) {
ggml_vk_transform_tensor_temporary(node->data, node);
}
}
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i], cgraph);
}
ggml_vk_preallocate_buffers();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
}
ggml_compute_params params = {};
params.type = GGML_TASK_COMPUTE;
params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}
bool ok = ggml_vk_compute_forward(&params, node);
if (!ok) {
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
}
#ifdef GGML_VULKAN_CHECK_RESULTS
else {
ggml_vk_check_results_1(&params, node);
}
#endif
GGML_ASSERT(ok);
}
ggml_vk_graph_cleanup();
return true;
}
GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
return true;
default:
return false;
}
break;
case GGML_OP_MUL_MAT:
{
struct ggml_tensor * a;
struct ggml_tensor * b;
if (op->op == GGML_OP_MUL_MAT) {
a = op->src[0];
b = op->src[1];
} else {
a = op->src[2];
b = op->src[1];
}
if (a->ne[3] != b->ne[3]) {
return false;
}
return true;
} break;
case GGML_OP_GET_ROWS:
{
switch (op->src[0]->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
return true;
default:
return false;
}
} break;
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;
ggml_type src1_type = op->src[1]->type;
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return true;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
return true;
}
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
return true;
}
return false;
} break;
case GGML_OP_DUP:
case GGML_OP_REPEAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_ALIBI:
return true;
default:
return false;
}
UNUSED(backend);
}
static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name,
/* .free = */ ggml_backend_vk_free,
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ ggml_backend_vk_supports_op,
};
GGML_CALL ggml_backend_t ggml_backend_vk_init() {
ggml_vk_init(); // TODO: remove from ggml.c
ggml_backend_vk_context * ctx = new ggml_backend_vk_context {
/* .name = */ GGML_VK_NAME,
};
ggml_backend_t vk_backend = new ggml_backend {
/* .interface = */ ggml_backend_vk_interface,
/* .context = */ ctx
};
return vk_backend;
}
GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
return backend && backend->iface.get_name == ggml_backend_vk_name;
}
// backend registry
GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
ggml_backend_t vk_backend = ggml_backend_vk_init();
return vk_backend;
UNUSED(params);
UNUSED(user_data);
}
extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
GGML_CALL int ggml_backend_vk_reg_devices() {
ggml_backend_register(GGML_VK_NAME, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(), nullptr);
return 1;
}
// checks
@ -4422,7 +4873,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
} else if (src0->backend == GGML_BACKEND_GPU) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
uint64_t offset = extra->offset;
uint64_t offset = extra->base_buffer_offset + extra->view_offset;
if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
for (int i3 = 0; i3 < src0->ne[3]; i3++) {
for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@ -4465,7 +4916,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
} else if (src1->backend == GGML_BACKEND_GPU) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
uint64_t offset = extra->offset;
uint64_t offset = extra->base_buffer_offset + extra->view_offset;
if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
for (int i3 = 0; i3 < src1->ne[3]; i3++) {
for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@ -4635,11 +5086,11 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor)
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
if (extra->offset + tensor_size >= extra->buffer_gpu->size) {
tensor_size = extra->buffer_gpu->size - extra->offset;
if (extra->base_buffer_offset + extra->view_offset + tensor_size >= extra->buffer_gpu->size) {
tensor_size = extra->buffer_gpu->size - (extra->base_buffer_offset + extra->view_offset);
}
ggml_vk_buffer_read(extra->buffer_gpu, extra->offset, tensor_data, tensor_size, vk_device.transfer_queue);
ggml_vk_buffer_read(extra->buffer_gpu, extra->base_buffer_offset + extra->view_offset, tensor_data, tensor_size, vk_device.transfer_queue);
}
float first_error_result = -1.0f;

Some files were not shown because too many files have changed in this diff Show more