Merge branch 'master' into HEAD

Georgi Gerganov 2023-09-28 21:34:26 +03:00
commit 546112944a
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
61 changed files with 3538 additions and 948 deletions


@@ -406,6 +406,7 @@ jobs:
 id: cuda-toolkit
 with:
 cuda: ${{ matrix.cuda }}
+method: 'network'
 sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
 - name: Build
@@ -456,21 +457,22 @@ jobs:
 path: |
 cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-freeBSD-latest:
-runs-on: macos-12
-steps:
-- name: Clone
-uses: actions/checkout@v3
-- name: Build
-uses: cross-platform-actions/action@v0.19.0
-with:
-operating_system: freebsd
-version: '13.2'
-run: |
-sudo pkg update
-sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
-gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
+# freeBSD-latest:
+# runs-on: macos-12
+# steps:
+# - name: Clone
+# uses: actions/checkout@v3
+#
+# - name: Build
+# uses: cross-platform-actions/action@v0.19.0
+# with:
+# operating_system: freebsd
+# version: '13.2'
+# hypervisor: 'qemu'
+# run: |
+# sudo pkg update
+# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
 release:
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

.gitignore vendored

@@ -51,7 +51,11 @@ models-mnt
 /save-load-state
 /server
 /simple
+/batched
+/export-lora
+/finetune
 /speculative
+/parallel
 /train-text-from-scratch
 /vdot
 build-info.h


@@ -80,6 +80,8 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
+set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+"llama: max. batch size for using peer access")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
@@ -116,7 +118,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
 add_custom_command(
 OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
 COMMENT "Generating build details from Git"
-COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
+COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
 DEPENDS "${GIT_DIR}/index"
 VERBATIM
@@ -160,6 +162,8 @@ if (APPLE AND LLAMA_ACCELERATE)
 message(STATUS "Accelerate framework found")
 add_compile_definitions(GGML_USE_ACCELERATE)
+add_compile_definitions(ACCELERATE_NEW_LAPACK)
+add_compile_definitions(ACCELERATE_LAPACK_ILP64)
 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
 else()
 message(WARNING "Accelerate framework not found")
@@ -304,6 +308,7 @@ if (LLAMA_CUBLAS)
 add_compile_definitions(GGML_CUDA_F16)
 endif()
 add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
 if (LLAMA_STATIC)
 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)


@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative finetune export-lora tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel finetune export-lora tests/test-c.o
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
@@ -305,6 +305,8 @@ ifndef LLAMA_NO_ACCELERATE
 # `-framework Accelerate` works both with Apple Silicon and Mac Intel
 ifeq ($(UNAME_S),Darwin)
 MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
+MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
 MK_LDFLAGS += -framework Accelerate
 endif
 endif # LLAMA_NO_ACCELERATE
@@ -368,6 +370,11 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
+NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
+else
+NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 #ifdef LLAMA_CUDA_CUBLAS
 # NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
@@ -512,22 +519,25 @@ main: examples/main/main.cpp build-info.h ggml.
 @echo '==== Run ./main -h for help. ===='
 @echo
-simple: examples/simple/simple.cpp ggml.o llama.o common.o $(OBJS)
+simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+batched: examples/batched/batched.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
@@ -567,6 +577,9 @@ export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o co
 speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -586,7 +599,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 tests: $(TEST_TARGETS)
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 ./$@


@@ -45,6 +45,8 @@ let package = Package(
 .unsafeFlags(["-Wno-shorten-64-to-32"]),
 .define("GGML_USE_K_QUANTS"),
 .define("GGML_USE_ACCELERATE")
+.define("ACCELERATE_NEW_LAPACK")
+.define("ACCELERATE_LAPACK_ILP64")
 ] + additionalSettings,
 linkerSettings: [
 .linkedFramework("Accelerate")


@@ -11,6 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 ### Hot topics
+- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+  **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio
 https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e
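The "new API" referenced in the hot topic above is the llama_batch / llama_decode interface that replaces llama_eval throughout this commit (see the warm-up change in common.cpp and examples/batched/batched.cpp further down). As orientation only, here is a minimal sketch of the call pattern against the API as it appears in this commit; it is not part of the diff, the model path and prompt are placeholders, and error handling is kept to the bare minimum:

```cpp
// Minimal sketch (not part of this commit) of the llama_batch / llama_decode call
// pattern used in this diff; signatures follow the September 2023 API shown here,
// where llama_decode still takes n_threads.
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    gpt_params params;
    params.model  = "models/7B/ggml-model-q4_0.gguf"; // placeholder path
    params.prompt = "Hello my name is";

    llama_backend_init(params.numa);

    llama_context_params ctx_params = llama_context_default_params();

    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        return 1;
    }

    // tokenize the prompt and decode it as a single batch on sequence 0, starting at position 0
    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0), params.n_threads) != 0) {
        fprintf(stderr, "llama_decode() failed\n");
        return 1;
    }

    // with no per-token logits requested, this buffer holds the logits of the last token
    const float * logits = llama_get_logits(ctx);
    (void) logits; // sampling would follow here, e.g. as in examples/batched/batched.cpp

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```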
@@ -90,6 +92,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
 - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
+- [X] Mistral AI v0.1
 **Bindings:**
@@ -392,12 +395,13 @@ Building the program with BLAS support may lead to some performance improvements
 | LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
 --->
 | Option | Legal values | Default | Description |
-|-------------------------|------------------------|---------|-------------|
+|--------------------------------|------------------------|---------|-------------|
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
 | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
 - #### hipBLAS
@@ -498,7 +502,7 @@ Building the program with BLAS support may lead to some performance improvements
 ```sh
 mkdir build
 cd build
-cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
 cmake --build . --config Release
 ```
 - CMake (Windows):
@@ -554,6 +558,10 @@ python3 convert.py models/7B/
 # quantize the model to 4-bits (using q4_0 method)
 ./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
+# update the gguf filetype to current if older version is unsupported by another application
+./quantize ./models/7B/ggml-model-q4_0.gguf ./models/7B/ggml-model-q4_0-v2.gguf COPY
 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```
@@ -590,6 +598,11 @@ Several quantization methods are supported. They differ in the resulting model d
 | 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
 | 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
+- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
+- recent k-quants improvements
+  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
+  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
 ### Perplexity (measuring model quality)
 You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).


@@ -36,17 +36,20 @@ const Maker = struct {
 }
 fn init(builder: *std.build.Builder) !Maker {
-const commit_hash = @embedFile(".git/refs/heads/master");
+// const commit_hash = @embedFile(".git/refs/heads/master");
+const target = builder.standardTargetOptions(.{});
 const config_header = builder.addConfigHeader(
 .{ .style = .blank, .include_path = "build-info.h" },
 .{
 .BUILD_NUMBER = 0,
-.BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+.BUILD_COMMIT = "12345", // omit newline
+.BUILD_COMPILER = "Zig 0.11.0",
+.BUILD_TARGET = try target.allocDescription(builder.allocator),
 },
 );
 var m = Maker{
 .builder = builder,
-.target = builder.standardTargetOptions(.{}),
+.target = target,
 .optimize = builder.standardOptimizeOption(.{}),
 .config_header = config_header,
 .enable_lto = false,
@@ -58,7 +61,7 @@ const Maker = struct {
 try m.addCFlag("-std=c11");
 try m.addCxxFlag("-std=c++11");
 try m.addProjectInclude(&.{});
-try m.addProjectInclude(&.{"examples"});
+try m.addProjectInclude(&.{"common"});
 return m;
 }
@@ -71,6 +74,7 @@ const Maker = struct {
 o.addCSourceFiles(&.{src}, m.cxxflags.items);
 o.linkLibCpp();
 }
+o.addConfigHeader(m.config_header);
 for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
 o.want_lto = m.enable_lto;
 return o;
@@ -104,15 +108,15 @@ pub fn build(b: *std.build.Builder) !void {
 const ggml = make.obj("ggml", "ggml.c");
 const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
 const llama = make.obj("llama", "llama.cpp");
-const common = make.obj("common", "examples/common.cpp");
-const console = make.obj("common", "examples/console.cpp");
-const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
+const common = make.obj("common", "common/common.cpp");
+const console = make.obj("common", "common/console.cpp");
+const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
 _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
-_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
+_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
 _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
 _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
-_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });
+_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });
 const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
 if (server.target.isWindows()) {


@@ -317,6 +317,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.n_chunks = std::stoi(argv[i]);
+} else if (arg == "-np" || arg == "--parallel") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.n_parallel = std::stoi(argv[i]);
+} else if (arg == "-ns" || arg == "--sequences") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.n_sequences = std::stoi(argv[i]);
 } else if (arg == "-m" || arg == "--model") {
 if (++i >= argc) {
 invalid_param = true;
@@ -372,6 +384,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 params.multiline_input = true;
 } else if (arg == "--simple-io") {
 params.simple_io = true;
+} else if (arg == "-cb" || arg == "--cont-batching") {
+params.cont_batching = true;
 } else if (arg == "--color") {
 params.use_color = true;
 } else if (arg == "--mlock") {
@@ -448,8 +462,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 params.use_mmap = false;
 } else if (arg == "--numa") {
 params.numa = true;
-} else if (arg == "--export") {
-params.export_cgraph = true;
 } else if (arg == "--verbose-prompt") {
 params.verbose_prompt = true;
 } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -468,8 +480,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 if (params.logdir.back() != DIRECTORY_SEPARATOR) {
 params.logdir += DIRECTORY_SEPARATOR;
 }
-} else if (arg == "--perplexity") {
-params.perplexity = true;
+} else if (arg == "--perplexity" || arg == "--all-logits") {
+params.logits_all = true;
 } else if (arg == "--ppl-stride") {
 if (++i >= argc) {
 invalid_param = true;
@@ -659,20 +671,23 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --cfg-negative-prompt-file FNAME\n");
 printf(" negative prompt file to use for guidance. (default: empty)\n");
 printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
-printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
-printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
+printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
 printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
 printf(" --no-penalize-nl do not penalize newline token\n");
 printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
 printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
 printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
-printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
+printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
 printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
 printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
 printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
 printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
 printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
+printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
+printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
 if (llama_mlock_supported()) {
 printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
 }
@@ -697,7 +712,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-printf(" --export export the computation graph to 'llama.ggml'\n");
 printf(" --verbose-prompt print prompt before generation\n");
 fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
 printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
@@ -751,7 +765,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 lparams.f16_kv = params.memory_f16;
 lparams.use_mmap = params.use_mmap;
 lparams.use_mlock = params.use_mlock;
-lparams.logits_all = params.perplexity;
+lparams.logits_all = params.logits_all;
 lparams.embedding = params.embedding;
 lparams.rope_freq_base = params.rope_freq_base;
 lparams.rope_freq_scale = params.rope_freq_scale;
@@ -800,8 +814,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 {
 LOG("warming up the model with an empty run\n");
-const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
-llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
+std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0), params.n_threads);
+llama_kv_cache_tokens_rm(lctx, -1, -1);
 llama_reset_timings(lctx);
 }
@@ -908,7 +923,7 @@ llama_token llama_sample_token(
 llama_token id = 0;
-float * logits = llama_get_logits(ctx) + idx * n_vocab;
+float * logits = llama_get_logits_ith(ctx, idx);
 // Apply params.logit_bias map
 for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@@ -959,11 +974,11 @@ llama_token llama_sample_token(
 if (mirostat == 1) {
 static float mirostat_mu = 2.0f * mirostat_tau;
 const int mirostat_m = 100;
-llama_sample_temperature(ctx, &cur_p, temp);
+llama_sample_temp(ctx, &cur_p, temp);
 id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
 } else if (mirostat == 2) {
 static float mirostat_mu = 2.0f * mirostat_tau;
-llama_sample_temperature(ctx, &cur_p, temp);
+llama_sample_temp(ctx, &cur_p, temp);
 id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
 } else {
 // Temperature sampling
@@ -971,7 +986,7 @@ llama_token llama_sample_token(
 llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
 llama_sample_typical (ctx, &cur_p, typical_p, 1);
 llama_sample_top_p (ctx, &cur_p, top_p, 1);
-llama_sample_temperature(ctx, &cur_p, temp);
+llama_sample_temp(ctx, &cur_p, temp);
 {
 const int n_top = 10;
@@ -1200,7 +1215,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
 fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
 fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
 fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
 fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
 dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
@@ -1287,6 +1301,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
 fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
 fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
+fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
 fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
 const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);


@@ -3,7 +3,6 @@
 #pragma once
 #include "llama.h"
-#include "build-info.h"
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
@@ -43,14 +42,16 @@ struct gpt_params {
 int32_t n_keep = 0; // number of tokens to keep from initial prompt
 int32_t n_draft = 16; // number of tokens to draft during speculative decoding
 int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+int32_t n_parallel = 1; // number of parallel sequences to decode
+int32_t n_sequences = 1; // number of sequences to decode
 int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
 int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
 float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
 int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
 int32_t n_beams = 0; // if non-zero then use beam search of given width.
-float rope_freq_base = 10000.0f; // RoPE base frequency
-float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
+float rope_freq_base = 0.0f; // RoPE base frequency
+float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
 // sampling parameters
 int32_t top_k = 40; // <= 0 to use vocab size
@@ -108,16 +109,16 @@ struct gpt_params {
 bool interactive_first = false; // wait for user input immediately
 bool multiline_input = false; // reverse the usage of `\`
 bool simple_io = false; // improves compatibility with subprocesses and limited consoles
+bool cont_batching = false; // insert new sequences for decoding on-the-fly
 bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
 bool ignore_eos = false; // ignore generated EOS tokens
 bool instruct = false; // instruction mode (used for Alpaca models)
 bool penalize_nl = true; // consider newlines as a repeatable token
-bool perplexity = false; // compute perplexity over the prompt
+bool logits_all = false; // return logits for all tokens in the batch
 bool use_mmap = true; // use mmap for faster loads
 bool use_mlock = false; // use mlock to keep model in memory
 bool numa = false; // attempt optimizations that help on some NUMA systems
-bool export_cgraph = false; // export the computation graph
 bool verbose_prompt = false; // print prompt tokens before generation
 };
@@ -184,7 +185,7 @@ std::string llama_detokenize_bpe(
 // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
 // - grammar: grammar to use for sampling, ignore if NULL
 // - last_tokens: needed for repetition penalty, ignore if empty
-// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
+// - idx: sample from llama_get_logits_ith(ctx, idx)
 //
 // returns:
 // - token: sampled token
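For orientation, the updated `idx` contract above means a caller fetches the logits of the idx-th batch entry through `llama_get_logits_ith`, as the new examples/batched/batched.cpp does later in this diff. A small illustrative sketch (the helper name is made up, not part of the commit):

```cpp
// Hypothetical helper (illustration only): greedily pick a token from the logits of the
// idx-th entry of the last decoded batch, using llama_get_logits_ith as in this diff.
#include "llama.h"

static llama_token greedy_sample_ith(llama_context * ctx, int32_t idx) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits_ith(ctx, idx); // valid if logits were requested for entry idx

    // pick the highest-logit token; llama_sample_token_greedy would do the same via candidates
    llama_token best = 0;
    for (llama_token t = 1; t < n_vocab; ++t) {
        if (logits[t] > logits[best]) {
            best = t;
        }
    }
    return best;
}
```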


@@ -1484,7 +1484,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc
 train->shuffle_next_sample = 0;
 }
-const bool last_epoch_reached = (params->n_epochs > 0 && train->train_epochs - data->first_epoch >= params->n_epochs);
+const bool last_epoch_reached = (params->n_epochs > 0 && (int64_t) train->train_epochs - data->first_epoch >= params->n_epochs);
 if (last_epoch_reached) {
 // allow optimization iteration at last epoch to be completed before canceling
 if (data->iter_at_last_epoch < 0) {


@@ -439,7 +439,7 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
 #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
 if n_head_kv is not None and n_head != n_head_kv:
-n_head //= n_head_kv
+n_head = n_head_kv
 return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
 .swapaxes(1, 2)
 .reshape(weights.shape))


@@ -48,7 +48,7 @@ make -j
 According to the BLIS documentation, we could set the following
 environment variables to modify the behavior of openmp:
-```
+```bash
 export GOMP_GPU_AFFINITY="0-19"
 export BLIS_NUM_THREADS=14
 ```


@@ -24,7 +24,9 @@ else()
 add_subdirectory(finetune)
 add_subdirectory(convert-llama2c-to-ggml)
 add_subdirectory(simple)
+add_subdirectory(batched)
 add_subdirectory(speculative)
+add_subdirectory(parallel)
 add_subdirectory(embd-input)
 add_subdirectory(llama-bench)
 add_subdirectory(beam-search)


@@ -486,6 +486,14 @@ static struct ggml_tensor * forward(
 struct ggml_tensor * kc = kv_self.k;
 struct ggml_tensor * vc = kv_self.v;
+struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+{
+int * data = (int *) KQ_pos->data;
+for (int i = 0; i < N; ++i) {
+data[i] = n_past + i;
+}
+}
 // inpL shape [n_embd,N,1,1]
 struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
 for (int il = 0; il < n_layer; ++il) {
@@ -513,8 +521,8 @@ static struct ggml_tensor * forward(
 // wk shape [n_embd, n_embd, 1, 1]
 // Qcur shape [n_embd/n_head, n_head, N, 1]
 // Kcur shape [n_embd/n_head, n_head, N, 1]
-struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
-struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
 // store key and value to memory
 {
@@ -714,9 +722,18 @@ static struct ggml_tensor * forward_batch(
 struct ggml_tensor * kc = kv_self.k;
 struct ggml_tensor * vc = kv_self.v;
+struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+{
+int * data = (int *) KQ_pos->data;
+for (int i = 0; i < N; ++i) {
+data[i] = n_past + i;
+}
+}
 // inpL shape [n_embd,N*n_batch,1]
 struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
 assert_shape_2d(inpL, n_embd, N*n_batch);
 for (int il = 0; il < n_layer; ++il) {
 struct ggml_tensor * inpSA = inpL;
@@ -744,8 +761,8 @@ static struct ggml_tensor * forward_batch(
 // wk shape [n_embd, n_embd, 1, 1]
 // Qcur shape [n_embd/n_head, n_head, N, n_batch]
 // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
-struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
 assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
 assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1003,6 +1020,14 @@ static struct ggml_tensor * forward_lora(
 struct ggml_tensor * kc = kv_self.k;
 struct ggml_tensor * vc = kv_self.v;
+struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+{
+int * data = (int *) KQ_pos->data;
+for (int i = 0; i < N; ++i) {
+data[i] = n_past + i;
+}
+}
 // inpL shape [n_embd,N,1,1]
 struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
 for (int il = 0; il < n_layer; ++il) {
@@ -1036,7 +1061,7 @@ static struct ggml_tensor * forward_lora(
 model->layers[il].wqb,
 cur)),
 n_embd/n_head, n_head, N),
-n_past, n_rot, 0, 0);
+KQ_pos, n_rot, 0, 0);
 struct ggml_tensor * Kcur = ggml_rope(ctx0,
 ggml_reshape_3d(ctx0,
 ggml_mul_mat(ctx0,
@@ -1045,7 +1070,7 @@ static struct ggml_tensor * forward_lora(
 model->layers[il].wkb,
 cur)),
 n_embd/n_head, n_head, N),
-n_past, n_rot, 0, 0);
+KQ_pos, n_rot, 0, 0);
 // store key and value to memory
 {


@@ -0,0 +1,5 @@
set(TARGET batched)
add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -0,0 +1,44 @@
# llama.cpp/example/batched
The example demonstrates batched generation from a given prompt
```bash
./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
...
main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
Hello my name is
main: generating 4 sequences ...
main: stream 0 finished
main: stream 1 finished
main: stream 2 finished
main: stream 3 finished
sequence 0:
Hello my name is Shirley. I am a 25-year-old female who has been working for over 5 years as a b
sequence 1:
Hello my name is Renee and I'm a 32 year old female from the United States. I'm looking for a man between
sequence 2:
Hello my name is Diana. I am looking for a housekeeping job. I have experience with children and have my own transportation. I am
sequence 3:
Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat. I am very playful and
main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s
llama_print_timings: load time = 587.00 ms
llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second)
llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second)
llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
llama_print_timings: total time = 4156.04 ms
```
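The `n_kv_req = 113` reported in this log follows the sizing formula used in batched.cpp below, `n_kv_req = n_prompt + (n_len - n_prompt)*n_parallel`. A small worked-arithmetic sketch of that run; the 5-token prompt length is inferred from the logged numbers rather than stated in the diff:

```cpp
// Worked example of the KV cache requirement reported in the log above.
// n_prompt = 5 is inferred from n_kv_req = 113 together with n_len = 32 and n_parallel = 4.
#include <cstdio>

int main() {
    const int n_len      = 32; // tokens per sequence, prompt included
    const int n_parallel = 4;  // parallel sequences
    const int n_prompt   = 5;  // prompt tokens, shared across sequences via llama_kv_cache_seq_cp

    const int n_kv_req = n_prompt + (n_len - n_prompt)*n_parallel;

    printf("n_kv_req  = %d\n", n_kv_req);                      // 5 + 27*4 = 113, as logged
    printf("generated = %d\n", (n_len - n_prompt)*n_parallel); // 27*4 = 108 decoded tokens, as logged
    return 0;
}
```

Because the prompt's KV cache entries are copied (shared) to every sequence rather than re-evaluated, only the generated tail costs one cache slot per sequence.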


@@ -0,0 +1,246 @@
#include "common.h"
#include "llama.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
return 1 ;
}
int n_parallel = 1;
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
params.prompt = argv[2];
}
if (argc >= 4) {
n_parallel = std::atoi(argv[3]);
}
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
// total length of the sequences including the prompt
const int n_len = 32;
// init LLM
llama_backend_init(params.numa);
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_len*n_parallel; // FIXME: use n_kv_req instead (tokenize with model after #3301)
ctx_params.n_batch = std::max(n_len, n_parallel);
// ctx_params.n_gpu_layers = 99; // offload all layers to the GPU
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
return 1;
}
// print the prompt token-by-token
fprintf(stderr, "\n");
for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
// create a llama_batch with size 512
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
// evaluate the initial prompt
batch.n_tokens = tokens_list.size();
for (int32_t i = 0; i < batch.n_tokens; i++) {
batch.token[i] = tokens_list[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch, params.n_threads) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
// assign the system KV cache to all parallel sequences
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
for (int32_t i = 1; i < n_parallel; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
}
if (n_parallel > 1) {
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
}
// main loop
// we will store the parallel decoded sequences in this vector
std::vector<std::string> streams(n_parallel);
// remember the batch index of the last token for each parallel sequence
// we need this to determine which logits to sample from
std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
int n_cur = batch.n_tokens;
int n_decode = 0;
const auto t_main_start = ggml_time_us();
while (n_cur <= n_len) {
// prepare the next batch
batch.n_tokens = 0;
// sample the next token for each parallel sequence / stream
for (int32_t i = 0; i < n_parallel; ++i) {
if (i_batch[i] < 0) {
// the stream has already finished
continue;
}
auto n_vocab = llama_n_vocab(ctx);
auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
const int top_k = 40;
const float top_p = 0.9f;
const float temp = 0.4f;
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temp (ctx, &candidates_p, temp);
const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {
LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
}
continue;
}
// if there is only one stream, we print immediately to stdout
if (n_parallel == 1) {
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
}
streams[i] += llama_token_to_piece(ctx, new_token_id);
// push this new token for next evaluation
batch.token [batch.n_tokens] = new_token_id;
batch.pos [batch.n_tokens] = n_cur;
batch.seq_id[batch.n_tokens] = i;
batch.logits[batch.n_tokens] = true;
i_batch[i] = batch.n_tokens;
batch.n_tokens += 1;
n_decode += 1;
}
// all streams are finished
if (batch.n_tokens == 0) {
break;
}
n_cur += 1;
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch, params.n_threads)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
LOG_TEE("\n");
if (n_parallel > 1) {
LOG_TEE("\n");
for (int32_t i = 0; i < n_parallel; ++i) {
LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
}
}
const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
llama_print_timings(ctx);
fprintf(stderr, "\n");
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}


@@ -158,8 +158,9 @@ int main(int argc, char ** argv)
 }
 std::cout << std::flush;
-int n_past = llama_get_kv_cache_token_count(ctx);
-if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
+int n_past = 0;
+
+if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0), params.n_threads))
 {
 fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
 return 1;


@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
@@ -20,7 +21,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
 struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 if (plan.work_size > 0) {
@@ -31,19 +32,19 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
 ggml_graph_compute(graph, &plan);
 }
-float tensor_sum_elements(const ggml_tensor * tensor) {
-float sum = 0;
-if (tensor->type==GGML_TYPE_F32) {
+static float tensor_sum_elements(const ggml_tensor * tensor) {
+double sum = 0;
+if (tensor->type == GGML_TYPE_F32) {
 for (int j = 0; j < tensor->ne[1]; j++) {
 for (int k = 0; k < tensor->ne[0]; k++) {
-sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
+sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
 }
 }
 }
 return sum;
 }
-void tensor_dump(const ggml_tensor * tensor, const char * name) {
+static void tensor_dump(const ggml_tensor * tensor, const char * name) {
 printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
 tensor->type, ggml_type_name(tensor->type),
 tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
@ -58,7 +59,7 @@ struct benchmark_params_struct {
int32_t n_iterations = 10; int32_t n_iterations = 10;
}; };
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) { static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "options:\n"); fprintf(stderr, "options:\n");
@ -125,12 +126,15 @@ int main(int argc, char ** argv) {
//printf("Memsize required = %i\n", sizex*sizex); //printf("Memsize required = %i\n", sizex*sizex);
// TODO: perform the bench for all types or for a user specified type
const ggml_type qtype = GGML_TYPE_Q4_1;
size_t ctx_size = 0; size_t ctx_size = 0;
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32); ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0); ctx_size += sizex*sizey*ggml_type_sizef(qtype);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0); ctx_size += sizex*sizey*ggml_type_sizef(qtype);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += 1024*1024*16; ctx_size += 1024*1024*16;
@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
ggml_set_f32(m2, 2.0f); ggml_set_f32(m2, 2.0f);
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n"); printf("\n------ Test 1 - Matrix Mult via F32 code\n");
// printf("Creating new tensor m11xm2\n"); // printf("Creating new tensor m11xm2\n");
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
@ -181,17 +185,16 @@ int main(int argc, char ** argv) {
TENSOR_DUMP(gf.nodes[0]); TENSOR_DUMP(gf.nodes[0]);
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n"); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
int32_t nelements = sizex*sizey; int32_t nelements = sizex*sizey;
int32_t ne[2] = { sizex, sizey };
std::vector<int64_t> hist_cur(1 << 4, 0); std::vector<int64_t> hist_cur(1 << 4, 0);
// Set up the benchmark matrices                                                 // Set up the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n"); // printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data()); ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
// Set up a the compute graph                                                    // Set up the compute graph
// printf("Creating new tensor q31\n"); // printf("Creating new tensor q31\n");
@ -202,8 +205,8 @@ int main(int argc, char ** argv) {
// Set up a second graph computation to make sure we override the CPU cache lines // Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n"); // printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data()); ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
// printf("Creating new tensor q32\n"); // printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
// Let's use the F32 result from above as a reference for the q4_0 multiplication // Let's use the F32 result from above as a reference for the quantized multiplication
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
@ -250,7 +253,7 @@ int main(int argc, char ** argv) {
// Check that the matrix multiplication result is in the right ballpark // Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantization will be slightly different // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]); float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
float delta = abs(sum_of_Q4_result - sum_of_F32_reference); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
if (delta > allowed_delta) { if (delta > allowed_delta) {

View file

@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "embd-input.h" #include "embd-input.h"
@ -79,7 +80,8 @@ bool eval_float(void * model, float * input, int N){
if (n_eval > n_batch) { if (n_eval > n_batch) {
n_eval = n_batch; n_eval = n_batch;
} }
if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
if (llama_decode(ctx, batch, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
return false; return false;
} }
@ -100,7 +102,7 @@ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
if (n_eval > params.n_batch) { if (n_eval > params.n_batch) {
n_eval = params.n_batch; n_eval = params.n_batch;
} }
if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
return false; return false;
} }
@ -182,11 +184,11 @@ llama_token sampling_id(struct MyModel* mymodel) {
if (mirostat == 1) { if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau; static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100; const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp); llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) { } else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau; static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp); llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else { } else {
// Temperature sampling // Temperature sampling
@ -194,7 +196,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1); llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1); llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temperature(ctx, &candidates_p, temp); llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p); id = llama_sample_token(ctx, &candidates_p);
} }
} }

View file

@ -1,3 +1,21 @@
# embedding # llama.cpp/example/embedding
TODO This example demonstrates how to generate a high-dimensional embedding vector for a given text with llama.cpp.
## Quick Start
To get started right away, run the following command, making sure to use the correct path for the model you have:
### Unix-based systems (Linux, macOS, etc.):
```bash
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
```
### Windows:
```powershell
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```
The above command will output space-separated float values.
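For programmatic use, the output can be captured and parsed directly. A minimal sketch in Python (assuming the `embedding` binary has been built in the current directory and `./path/to/model` is replaced with a real model file):
```python
import subprocess

# Run the embedding example; stdout carries the embedding values, stderr carries the logs.
result = subprocess.run(
    ["./embedding", "-m", "./path/to/model", "--log-disable", "-p", "Hello World!"],
    capture_output=True, text=True, check=True,
)

# The output is space-separated floats - one value per embedding dimension.
embedding = [float(x) for x in result.stdout.split()]
print(f"embedding dimension: {len(embedding)}")
```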

View file

@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
@ -77,7 +78,7 @@ int main(int argc, char ** argv) {
while (!embd_inp.empty()) { while (!embd_inp.empty()) {
int n_tokens = std::min(params.n_batch, (int) embd_inp.size()); int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) { if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
return 1; return 1;
} }

View file

@ -200,12 +200,12 @@ static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_
} }
if (params->fn_model_base == default_params.fn_model_base) { if (params->fn_model_base == default_params.fn_model_base) {
fprintf(stderr, "error: please specify a filename for model-base.\n", arg.c_str()); fprintf(stderr, "error: please specify a filename for model-base.\n");
export_lora_print_usage(argc, argv, &default_params); export_lora_print_usage(argc, argv, &default_params);
exit(1); exit(1);
} }
if (params->fn_model_out == default_params.fn_model_out) { if (params->fn_model_out == default_params.fn_model_out) {
fprintf(stderr, "error: please specify a filename for model-out.\n", arg.c_str()); fprintf(stderr, "error: please specify a filename for model-out.\n");
export_lora_print_usage(argc, argv, &default_params); export_lora_print_usage(argc, argv, &default_params);
exit(1); exit(1);
} }

View file

@ -624,15 +624,23 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
} }
}; };
// KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
{
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
// rope has so much parameters that we make a custom function for it // rope has so much parameters that we make a custom function for it
auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale] auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
(struct ggml_tensor * t) -> struct ggml_tensor * { (struct ggml_tensor * t) -> struct ggml_tensor * {
// not capturing these, to silence warnings // not capturing these, to silence warnings
const int n_past = 0;
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom(ctx, return ggml_rope_custom(ctx,
t, n_past, n_rot, rope_mode, n_ctx, t, KQ_pos, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale); rope_freq_base, rope_freq_scale);
}; };
@ -827,10 +835,6 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
std::vector<char> keybuf; std::vector<char> keybuf;
keybuf.resize(512); keybuf.resize(512);
auto kv = [&arch, &keybuf](const char * key) -> const char * {
snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
return keybuf.data();
};
GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
GGML_ASSERT(arch == "llama"); GGML_ASSERT(arch == "llama");

View file

@ -367,10 +367,10 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
keyidx = gguf_find_key(ggufctx, "general.architecture"); keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.file_type"); keyidx = gguf_find_key(ggufctx, "general.file_type");
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
} }

View file

@ -380,10 +380,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
keyidx = gguf_find_key(ggufctx, "general.architecture"); keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.file_type"); keyidx = gguf_find_key(ggufctx, "general.file_type");
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
} }

View file

@ -0,0 +1,271 @@
# llama.cpp/example/llama-bench
Performance testing tool for llama.cpp.
## Table of contents
1. [Syntax](#syntax)
2. [Examples](#examples)
1. [Text generation with different models](#text-generation-with-different-models)
2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes)
3. [Different numbers of threads](#different-numbers-of-threads)
4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu)
3. [Output formats](#output-formats)
1. [Markdown](#markdown)
2. [CSV](#csv)
3. [JSON](#json)
4. [SQL](#sql)
## Syntax
```
usage: ./llama-bench [options]
options:
-h, --help
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-b, --batch-size <n> (default: 512)
--memory-f32 <0|1> (default: 0)
-t, --threads <n> (default: 16)
-ngl N, --n-gpu-layers <n> (default: 99)
-mg i, --main-gpu <i> (default: 0)
-mmq, --mul-mat-q <0|1> (default: 1)
-ts, --tensor_split <ts0/ts1/..>
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
```
llama-bench can perform two types of tests:
- Prompt processing (pp): processing a prompt in batches (`-p`)
- Text generation (tg): generating a sequence of tokens (`-n`)
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
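A small sketch of how the reported average and standard deviation relate to the individual repetitions, using the `samples_ts` values from the JSON example further below (the spread appears to be the sample standard deviation):
```python
import statistics

# Per-repetition tokens/s for the pp 512 test from the JSON example (-r 5, the default).
samples_ts = [2394.34, 2419.25, 2411.36, 2422.75, 2407.16]

avg_ts    = statistics.mean(samples_ts)   # ~2410.97, reported as "avg_ts"
stddev_ts = statistics.stdev(samples_ts)  # ~11.16, reported as "stddev_ts"

print(f"{avg_ts:.2f} ± {stddev_ts:.2f} t/s")  # the format used in the "t/s" column
```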
For a description of the other options, see the [main example](../main/README.md).
## Examples
### Text generation with different models
```sh
$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
### Prompt processing with different batch sizes
```sh
$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
```
| model | size | params | backend | ngl | n_batch | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
### Different numbers of threads
```sh
$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32
```
| model | size | params | backend | threads | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 |
| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |         32 | tg 16      |     16.41 ± 0.79 |
### Different numbers of layers offloaded to the GPU
```sh
$ ./llama-bench -ngl 10,20,30,31,32,33,34,35
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
## Output formats
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
### Markdown
```sh
$ ./llama-bench -o md
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
### CSV
```sh
$ ./llama-bench -o csv
```
```csv
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
```
### JSON
```sh
$ ./llama-bench -o json
```
```json
[
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"main_gpu": 0,
"mul_mat_q": true,
"tensor_split": "0.00",
"n_prompt": 512,
"n_gen": 0,
"test_time": "2023-09-23T12:09:57Z",
"avg_ns": 212365953,
"stddev_ns": 985423,
"avg_ts": 2410.974041,
"stddev_ts": 11.163766,
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
},
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"main_gpu": 0,
"mul_mat_q": true,
"tensor_split": "0.00",
"n_prompt": 0,
"n_gen": 128,
"test_time": "2023-09-23T12:09:59Z",
"avg_ns": 977425219,
"stddev_ns": 9268593,
"avg_ts": 130.965708,
"stddev_ts": 1.238924,
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
}
]
```
### SQL
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
```sh
$ ./llama-bench -o sql
```
```sql
CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
opencl INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
cpu_info TEXT,
gpu_info TEXT,
model_filename TEXT,
model_type TEXT,
model_size INTEGER,
model_n_params INTEGER,
n_batch INTEGER,
n_threads INTEGER,
f16_kv INTEGER,
n_gpu_layers INTEGER,
main_gpu INTEGER,
mul_mat_q INTEGER,
tensor_split TEXT,
n_prompt INTEGER,
n_gen INTEGER,
test_time TEXT,
avg_ns INTEGER,
stddev_ns INTEGER,
avg_ts REAL,
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
```
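To accumulate results over time from a script, the same statements can be applied with Python's built-in `sqlite3` module; a minimal sketch (the database file name `llama-bench.sqlite` is arbitrary):
```python
import sqlite3
import subprocess

# Generate the SQL shown above and apply it to a local database file.
sql = subprocess.run(
    ["./llama-bench", "-o", "sql"], capture_output=True, text=True, check=True
).stdout

con = sqlite3.connect("llama-bench.sqlite")
con.executescript(sql)  # CREATE TABLE IF NOT EXISTS test (...) plus the INSERT statements
con.commit()

# Example query: average generation speed per model over all stored runs.
for model_type, avg in con.execute(
    "SELECT model_type, AVG(avg_ts) FROM test WHERE n_gen > 0 GROUP BY model_type"
):
    print(model_type, avg)

con.close()
```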

View file

@ -891,7 +891,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
int n_processed = 0; int n_processed = 0;
while (n_processed < n_prompt) { while (n_processed < n_prompt) {
int n_tokens = std::min(n_prompt - n_processed, n_batch); int n_tokens = std::min(n_prompt - n_processed, n_batch);
llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads); llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0), n_threads);
n_processed += n_tokens; n_processed += n_tokens;
} }
} }
@ -899,11 +899,11 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
llama_token token = llama_token_bos(ctx); llama_token token = llama_token_bos(ctx);
for (int i = 0; i < n_gen; i++) { for (int i = 0; i < n_gen; i++) {
llama_eval(ctx, &token, 1, n_past + i, n_threads); llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0), n_threads);
} }
} }
static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) { static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
(void) level; (void) level;
(void) text; (void) text;
(void) user_data; (void) user_data;
@ -977,6 +977,8 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx); test t(inst, lmodel, ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1);
// warmup run // warmup run
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads); test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
@ -986,6 +988,8 @@ int main(int argc, char ** argv) {
} }
for (int i = 0; i < params.reps; i++) { for (int i = 0; i < params.reps; i++) {
llama_kv_cache_tokens_rm(ctx, -1, -1);
uint64_t t_start = get_time_ns(); uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);

View file

@ -124,7 +124,7 @@ int main(int argc, char ** argv) {
console::init(params.simple_io, params.use_color); console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); }); atexit([]() { console::cleanup(); });
if (params.perplexity) { if (params.logits_all) {
printf("\n************\n"); printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n"); printf("************\n\n");
@ -200,15 +200,6 @@ int main(int argc, char ** argv) {
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
} }
// export the cgraph and exit
if (params.export_cgraph) {
llama_eval_export(ctx, "llama.ggml");
llama_free(ctx);
llama_free_model(model);
return 0;
}
std::string path_session = params.path_prompt_cache; std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens; std::vector<llama_token> session_tokens;
@ -508,18 +499,23 @@ int main(int argc, char ** argv) {
break; break;
} }
const int n_left = n_past - params.n_keep; const int n_left = n_past - params.n_keep - 1;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep); const int n_discard = n_left/2;
// always keep the first token - BOS LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past = std::max(1, params.n_keep); n_past, n_left, n_ctx, params.n_keep, n_discard);
n_past_guidance = std::max(1, params.n_keep + guidance_offset);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
n_past -= n_discard;
if (ctx_guidance) {
n_past_guidance -= n_discard;
}
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
// insert n_left/2 tokens at the start of embd from last_tokens
embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
LOG("clear session path\n"); LOG("clear session path\n");
@ -580,7 +576,7 @@ int main(int argc, char ** argv) {
for (int i = 0; i < input_size; i += params.n_batch) { for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch); int n_eval = std::min(input_size - i, params.n_batch);
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0), params.n_threads)) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_TEE("%s : failed to eval\n", __func__);
return 1; return 1;
} }
@ -597,7 +593,7 @@ int main(int argc, char ** argv) {
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0), params.n_threads)) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_TEE("%s : failed to eval\n", __func__);
return 1; return 1;
} }

View file

@ -1,22 +1,25 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
This script converts Hugging Face llama models to GGML and quantizes them. This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
Usage: Usage:
python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)] python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
Arguments: Arguments:
- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub. - model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used. - --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used. - --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'. - --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created. - --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
Quant types: Old quant types (some base model types require these):
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M - Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L - Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M - Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M - Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
New quant types (recommended):
- Q2_K: smallest, extreme quality loss - not recommended - Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M - Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss - Q3_K_S: very small, very high quality loss
@ -40,9 +43,7 @@ import argparse
import os import os
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
def main(model, outname, outdir, quants, keep_fp16): def main(model, model_type, outname, outdir, quants, keep_fp16):
ggml_version = "v3"
if not os.path.isdir(model): if not os.path.isdir(model):
print(f"Model not found at {model}. Downloading...") print(f"Model not found at {model}. Downloading...")
try: try:
@ -63,17 +64,20 @@ def main(model, outname, outdir, quants, keep_fp16):
print("Building llama.cpp") print("Building llama.cpp")
subprocess.run(f"cd .. && make quantize", shell=True, check=True) subprocess.run(f"cd .. && make quantize", shell=True, check=True)
fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin" fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
print(f"Making unquantised GGML at {fp16}") print(f"Making unquantised GGUF at {fp16}")
if not os.path.isfile(fp16): if not os.path.isfile(fp16):
if model_type != "llama":
subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
else:
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True) subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
else: else:
print(f"Unquantised GGML already exists at: {fp16}") print(f"Unquantised GGML already exists at: {fp16}")
print("Making quants") print("Making quants")
for type in quants: for type in quants:
outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin" outfile = f"{outdir}/{outname}.gguf.{type}.bin"
print(f"Making {type} : {outfile}") print(f"Making {type} : {outfile}")
subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True) subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
@ -81,8 +85,9 @@ def main(model, outname, outdir, quants, keep_fp16):
os.remove(fp16) os.remove(fp16)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.') parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name') parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
parser.add_argument('--outname', default=None, help='Output model(s) name') parser.add_argument('--outname', default=None, help='Output model(s) name')
parser.add_argument('--outdir', default=None, help='Output directory') parser.add_argument('--outdir', default=None, help='Output directory')
parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types') parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
@ -90,4 +95,4 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16) main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)

View file

@ -0,0 +1,8 @@
set(TARGET parallel)
add_executable(${TARGET} parallel.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -0,0 +1,3 @@
# llama.cpp/example/parallel
Simplified simulation of serving incoming requests in parallel

View file

@ -0,0 +1,380 @@
// A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel.
#include "build-info.h"
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
size_t start = 0;
size_t end = str.size();
while (start < end && isspace(str[start])) {
start += 1;
}
while (end > start && isspace(str[end - 1])) {
end -= 1;
}
return str.substr(start, end - start);
}
static std::string k_system =
R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
User: Recommend a nice restaurant in the area.
Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
User: Who is Richard Feynman?
Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
User:)";
static std::vector<std::string> k_prompts = {
"What is the meaning of life?",
"Tell me an interesting fact about llamas.",
"What is the best way to cook a steak?",
"Are you familiar with the Special Theory of Relativity and can you explain it to me?",
"Recommend some interesting books to read.",
"What is the best way to learn a new language?",
"How to get a job at Google?",
"If you could have any superpower, what would it be?",
"I want to learn how to play the piano.",
};
struct client {
int32_t id = 0;
llama_seq_id seq_id = -1;
llama_token sampled;
int64_t t_start_prompt;
int64_t t_start_gen;
int32_t n_prompt = 0;
int32_t n_decoded = 0;
int32_t i_batch = -1;
std::string input;
std::string prompt;
std::string response;
std::vector<llama_token> tokens_prev;
};
int main(int argc, char ** argv) {
srand(1234);
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;
// requests to simulate
const int32_t n_seq = params.n_sequences;
// insert new requests as soon as the previous one is done
const bool cont_batching = params.cont_batching;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("parallel", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init(params.numa);
llama_model * model = NULL;
llama_context * ctx = NULL;
// load the target model
params.logits_all = true;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
fprintf(stderr, "\n\n");
fflush(stderr);
const int n_ctx = llama_n_ctx(ctx);
const int n_vocab = llama_n_vocab(ctx);
std::vector<client> clients(n_clients);
for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i];
client.id = i;
client.tokens_prev.resize(std::max(256, params.n_predict));
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
std::vector<llama_token> tokens_system;
tokens_system = ::llama_tokenize(ctx, k_system, true);
const int32_t n_tokens_system = tokens_system.size();
llama_seq_id g_seq_id = 0;
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
llama_batch batch = llama_batch_init(params.n_ctx, 0);
int32_t n_total_prompt = 0;
int32_t n_total_gen = 0;
int32_t n_cache_miss = 0;
const auto t_main_start = ggml_time_us();
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
LOG_TEE("\n");
{
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
batch.n_tokens = n_tokens_system;
for (int32_t i = 0; i < batch.n_tokens; ++i) {
batch.token[i] = tokens_system[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
if (llama_decode(ctx, batch, params.n_threads) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i < n_clients; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
}
LOG_TEE("\n");
}
LOG_TEE("Processing requests ...\n\n");
while (true) {
batch.n_tokens = 0;
// decode any currently ongoing sequences
for (auto & client : clients) {
if (client.seq_id == -1) {
continue;
}
batch.token [batch.n_tokens] = client.sampled;
batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded;
batch.seq_id[batch.n_tokens] = client.id;
batch.logits[batch.n_tokens] = true;
client.n_decoded += 1;
client.i_batch = batch.n_tokens;
batch.n_tokens += 1;
}
if (batch.n_tokens == 0) {
// all sequences have ended - clear the entire KV cache
for (int i = 0; i < n_clients; ++i) {
llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
}
LOG_TEE("%s: clearing the KV cache\n", __func__);
}
// insert new sequences for decoding
if (cont_batching || batch.n_tokens == 0) {
for (auto & client : clients) {
if (client.seq_id == -1 && g_seq_id < n_seq) {
client.seq_id = g_seq_id;
client.t_start_prompt = ggml_time_us();
client.t_start_gen = 0;
client.input = k_prompts[rand() % k_prompts.size()];
client.prompt = client.input + "\nAssistant:";
client.response = "";
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
// do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt;
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
batch.token [batch.n_tokens] = tokens_prompt[i];
batch.pos [batch.n_tokens] = i + n_tokens_system;
batch.seq_id[batch.n_tokens] = client.id;
batch.logits[batch.n_tokens] = false;
batch.n_tokens += 1;
}
// extract the logits only for the last token
if (batch.n_tokens > 0) {
batch.logits[batch.n_tokens - 1] = true;
}
client.n_prompt = tokens_prompt.size();
client.n_decoded = 0;
client.i_batch = batch.n_tokens - 1;
LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
g_seq_id += 1;
// insert new requests one-by-one
//if (cont_batching) {
// break;
//}
}
}
}
if (batch.n_tokens == 0) {
break;
}
// process in chunks of params.n_batch
int32_t n_batch = params.n_batch;
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
// experiment: process in powers of 2
//if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
// n_batch /= 2;
// i -= n_batch;
// continue;
//}
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view, params.n_threads);
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return 1;
}
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
n_cache_miss += 1;
// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
i -= n_batch;
continue;
}
LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
for (auto & client : clients) {
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
continue;
}
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients
// have their prompt already processed
client.t_start_gen = ggml_time_us();
}
// remember which tokens were sampled - used for repetition penalties during sampling
client.tokens_prev.erase(client.tokens_prev.begin());
client.tokens_prev.push_back(id);
const std::string token_str = llama_token_to_piece(ctx, id);
client.response += token_str;
client.sampled = id;
//printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n",
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
if (client.n_decoded > 2 &&
(id == llama_token_eos(ctx) ||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
client.response.find("User:") != std::string::npos ||
client.response.find('\n') != std::string::npos)) {
// basic reverse prompt
const size_t pos = client.response.find("User:");
if (pos != std::string::npos) {
client.response = client.response.substr(0, pos);
}
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);
const auto t_main_end = ggml_time_us();
LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
client.id, client.seq_id, client.n_prompt, client.n_decoded,
(t_main_end - client.t_start_prompt) / 1e6,
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
n_cache_miss,
::trim(client.input).c_str(),
::trim(client.response).c_str());
n_total_prompt += client.n_prompt;
n_total_gen += client.n_decoded;
client.seq_id = -1;
}
client.i_batch = -1;
}
}
}
const auto t_main_end = ggml_time_us();
LOG_TEE("\n\n");
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
LOG_TEE("\n\n");
llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}

View file

@ -1,3 +1,21 @@
# perplexity # perplexity
TODO TODO
## Llama 2 70B Scorechart
Quantization | Model size (GiB) | Perplexity | Delta to fp16
-- | -- | -- | --
Q4_0 | 36.20 | 3.5550 | 3.61%
Q4_1 | 40.20 | 3.5125 | 2.37%
Q5_0 | 44.20 | 3.4744 | 1.26%
Q2_K | 27.27 | 3.7339 | 8.82%
Q3_K_S | 27.86 | 3.7019 | 7.89%
Q3_K_M | 30.83 | 3.5932 | 4.72%
Q3_K_L | 33.67 | 3.5617 | 3.80%
Q4_K_S | 36.39 | 3.4852 | 1.57%
Q4_K_M | 38.54 | 3.4725 | 1.20%
Q5_K_S | 44.20 | 3.4483 | 0.50%
Q5_K_M | 45.41 | 3.4451 | 0.40%
Q6_K | 52.70 | 3.4367 | 0.16%
fp16 | 128.5 | 3.4313 | -
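The `Delta to fp16` column appears to be the relative increase in perplexity over the fp16 baseline; a quick check against the Q4_0 row:
```python
ppl_fp16 = 3.4313  # fp16 baseline perplexity from the table
ppl_q4_0 = 3.5550  # Q4_0 perplexity from the table

delta = (ppl_q4_0 - ppl_fp16) / ppl_fp16 * 100
print(f"{delta:.2f}%")  # ~3.61%, matching the "Delta to fp16" column
```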

View file

@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
@ -79,7 +80,9 @@ static void write_logfile(
static std::vector<float> softmax(const std::vector<float>& logits) { static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size()); std::vector<float> probs(logits.size());
float max_logit = logits[0]; float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v); for (float v : logits) {
max_logit = std::max(max_logit, v);
}
double sum_exp = 0.0; double sum_exp = 0.0;
for (size_t i = 0; i < logits.size(); i++) { for (size_t i = 0; i < logits.size(); i++) {
// Subtract the maximum logit value from the current logit value for numerical stability // Subtract the maximum logit value from the current logit value for numerical stability
@ -88,15 +91,21 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
sum_exp += exp_logit; sum_exp += exp_logit;
probs[i] = exp_logit; probs[i] = exp_logit;
} }
for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; for (size_t i = 0; i < probs.size(); i++) {
probs[i] /= sum_exp;
}
return probs; return probs;
} }
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0]; float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]); for (int i = 1; i < n_vocab; ++i) {
max_logit = std::max(max_logit, logits[i]);
}
double sum_exp = 0.0; double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit); for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
} }
@ -107,7 +116,8 @@ static void process_logits(
std::mutex mutex; std::mutex mutex;
int counter = 0; int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
double local_nll = 0, local_nll2 = 0; double local_nll = 0;
double local_nll2 = 0;
while (true) { while (true) {
std::unique_lock<std::mutex> lock(mutex); std::unique_lock<std::mutex> lock(mutex);
int i = counter++; int i = counter++;
@ -125,10 +135,13 @@ static void process_logits(
prob_history[i] = results.prob; prob_history[i] = results.prob;
} }
}; };
for (auto & w : workers) w = std::thread(compute); for (auto & w : workers) {
w = std::thread(compute);
}
compute(); compute();
for (auto & w : workers) w.join(); for (auto & w : workers) {
w.join();
}
} }
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
@ -194,12 +207,15 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const auto t_start = std::chrono::high_resolution_clock::now(); const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1);
for (int j = 0; j < num_batches; ++j) { for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch; const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch); const int batch_size = std::min(end - batch_start, n_batch);
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0), params.n_threads)) {
//fprintf(stderr, "%s : failed to eval\n", __func__); //fprintf(stderr, "%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history}; return {tokens, -1, logit_history, prob_history};
} }
@ -319,6 +335,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const auto t_start = std::chrono::high_resolution_clock::now(); const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1);
for (int j = 0; j < num_batches; ++j) { for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch; const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch); const int batch_size = std::min(end - batch_start, n_batch);
@ -331,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
tokens[batch_start] = llama_token_bos(ctx); tokens[batch_start] = llama_token_bos(ctx);
} }
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history}; return {tokens, -1, logit_history, prob_history};
} }
@ -401,7 +420,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
} }
static std::vector<float> hellaswag_evaluate_tokens( static std::vector<float> hellaswag_evaluate_tokens(
llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab, int n_thread
) { ) {
std::vector<float> result; std::vector<float> result;
result.reserve(tokens.size() * n_vocab); result.reserve(tokens.size() * n_vocab);
@ -409,7 +428,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) { for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
size_t n_tokens = tokens.size() - i_chunk * n_batch; size_t n_tokens = tokens.size() - i_chunk * n_batch;
n_tokens = std::min(n_tokens, size_t(n_batch)); n_tokens = std::min(n_tokens, size_t(n_batch));
if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) { if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0), n_thread)) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
return {}; return {};
} }
@ -549,6 +568,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
query_embd.resize(32); query_embd.resize(32);
} }
// clear the KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1);
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads); auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
if (logits.empty()) { if (logits.empty()) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
@ -660,7 +682,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
params.perplexity = true; params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
if (params.ppl_stride > 0) { if (params.ppl_stride > 0) {

View file

@ -1,4 +1,5 @@
#define LLAMA_API_INTERNAL #define LLAMA_API_INTERNAL
#include "build-info.h"
#include "common.h" #include "common.h"
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"

View file

@ -1,3 +1,44 @@
# quantize # quantize
TODO TODO
## Llama 2 7B
Quantization | Bits per Weight (BPW)
-- | --
Q2_K | 3.35
Q3_K_S | 3.50
Q3_K_M | 3.91
Q3_K_L | 4.27
Q4_K_S | 4.58
Q4_K_M | 4.84
Q5_K_S | 5.52
Q5_K_M | 5.68
Q6_K | 6.56
## Llama 2 13B
Quantization | Bits per Weight (BPW)
-- | --
Q2_K | 3.34
Q3_K_S | 3.48
Q3_K_M | 3.89
Q3_K_L | 4.26
Q4_K_S | 4.56
Q4_K_M | 4.83
Q5_K_S | 5.51
Q5_K_M | 5.67
Q6_K | 6.56
## Llama 2 70B
Quantization | Bits per Weight (BPW)
-- | --
Q2_K | 3.40
Q3_K_S | 3.47
Q3_K_M | 3.85
Q3_K_L | 4.19
Q4_K_S | 4.53
Q4_K_M | 4.80
Q5_K_S | 5.50
Q5_K_M | 5.65
Q6_K | 6.56
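These BPW figures translate directly into file sizes: a quantized model is roughly parameter count × BPW / 8 bytes plus a small amount of metadata. A minimal sketch of that arithmetic, assuming a parameter count of about 6.7 billion for the 7B model (an assumption, not something read from the tables above):
```cpp
#include <cstdio>

int main() {
    const double n_params = 6.7e9; // assumed parameter count for Llama 2 7B (not from the tables)
    const double bpw      = 4.84;  // Q4_K_M row from the 7B table above
    // bits -> bytes -> GB: prints roughly 4.1
    std::printf("~%.1f GB\n", n_params * bpw / 8.0 / 1e9);
    return 0;
}
```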

View file

@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
@ -34,11 +35,11 @@ int main(int argc, char ** argv) {
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0); auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
// init // init
auto model = llama_load_model_from_file(params.model.c_str(), lparams); auto * model = llama_load_model_from_file(params.model.c_str(), lparams);
if (model == nullptr) { if (model == nullptr) {
return 1; return 1;
} }
auto ctx = llama_new_context_with_model(model, lparams); auto * ctx = llama_new_context_with_model(model, lparams);
if (ctx == nullptr) { if (ctx == nullptr) {
llama_free_model(model); llama_free_model(model);
return 1; return 1;
@ -53,7 +54,7 @@ int main(int argc, char ** argv) {
} }
// evaluate prompt // evaluate prompt
llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads); llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0), params.n_threads);
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens); last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
n_past += n_prompt_tokens; n_past += n_prompt_tokens;
@ -77,7 +78,7 @@ int main(int argc, char ** argv) {
printf("\n%s", params.prompt.c_str()); printf("\n%s", params.prompt.c_str());
for (auto i = 0; i < params.n_predict; i++) { for (auto i = 0; i < params.n_predict; i++) {
auto logits = llama_get_logits(ctx); auto * logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx); auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -90,7 +91,7 @@ int main(int argc, char ** argv) {
last_n_tokens_data.push_back(next_token); last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str.c_str()); printf("%s", next_token_str.c_str());
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) { if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0), params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__); fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
@ -105,7 +106,7 @@ int main(int argc, char ** argv) {
llama_free(ctx); llama_free(ctx);
// make new context // make new context
auto ctx2 = llama_new_context_with_model(model, lparams); auto * ctx2 = llama_new_context_with_model(model, lparams);
// Load state (rng, logits, embedding and kv_cache) from file // Load state (rng, logits, embedding and kv_cache) from file
{ {
@ -137,7 +138,7 @@ int main(int argc, char ** argv) {
// second run // second run
for (auto i = 0; i < params.n_predict; i++) { for (auto i = 0; i < params.n_predict; i++) {
auto logits = llama_get_logits(ctx2); auto * logits = llama_get_logits(ctx2);
auto n_vocab = llama_n_vocab(ctx2); auto n_vocab = llama_n_vocab(ctx2);
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -150,7 +151,7 @@ int main(int argc, char ** argv) {
last_n_tokens_data.push_back(next_token); last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str.c_str()); printf("%s", next_token_str.c_str());
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) { if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0), params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__); fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx2); llama_free(ctx2);
llama_free_model(model); llama_free_model(model);

View file

@ -381,6 +381,10 @@ struct llama_server_context
// compare the evaluated prompt with the new prompt // compare the evaluated prompt with the new prompt
n_past = common_part(embd, prompt_tokens); n_past = common_part(embd, prompt_tokens);
// since #3228 we now have to manually manage the KV cache
llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
embd = prompt_tokens; embd = prompt_tokens;
if (n_past == num_prompt_tokens) if (n_past == num_prompt_tokens)
{ {
@ -411,19 +415,27 @@ struct llama_server_context
if (embd.size() >= (size_t)params.n_ctx) if (embd.size() >= (size_t)params.n_ctx)
{ {
// Reset context // Shift context
const int n_left = (params.n_ctx - params.n_keep) / 2;
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
{
embd[i - n_discard] = embd[i];
}
embd.resize(embd.size() - n_discard);
n_past -= n_discard;
std::vector<llama_token> new_tokens(embd.begin(), embd.begin() + params.n_keep);
new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end());
embd = new_tokens;
n_past = params.n_keep;
truncated = true; truncated = true;
LOG_VERBOSE("input truncated", { LOG_VERBOSE("input truncated", {
{"n_ctx", params.n_ctx}, {"n_ctx", params.n_ctx},
{"n_keep", params.n_keep}, {"n_keep", params.n_keep},
{"n_left", n_left}, {"n_left", n_left},
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
}); });
} }
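As a worked example of the shift above, with hypothetical values n_keep = 64 and n_past = 2047: n_left = 2047 - 64 - 1 = 1982 and n_discard = 991, so cache positions 65..1055 are removed, positions 1056..2046 are shifted down by 991 to land at 65..1055, and n_past drops to 1056; the first n_keep + 1 positions stay put and roughly half of the remaining context is freed for new tokens.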
@ -434,7 +446,8 @@ struct llama_server_context
{ {
n_eval = params.n_batch; n_eval = params.n_batch;
} }
if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads))
if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads))
{ {
LOG_ERROR("failed to eval", { LOG_ERROR("failed to eval", {
{"n_eval", n_eval}, {"n_eval", n_eval},
@ -523,13 +536,13 @@ struct llama_server_context
{ {
static float mirostat_mu = 2.0f * mirostat_tau; static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100; const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp); llama_sample_temp(ctx, &candidates_p, temp);
result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} }
else if (mirostat == 2) else if (mirostat == 2)
{ {
static float mirostat_mu = 2.0f * mirostat_tau; static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp); llama_sample_temp(ctx, &candidates_p, temp);
result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} }
else else
@ -540,7 +553,7 @@ struct llama_server_context
llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep); llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
llama_sample_typical(ctx, &candidates_p, typical_p, min_keep); llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
llama_sample_top_p(ctx, &candidates_p, top_p, min_keep); llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
llama_sample_temperature(ctx, &candidates_p, temp); llama_sample_temp(ctx, &candidates_p, temp);
result.tok = llama_sample_token(ctx, &candidates_p); result.tok = llama_sample_token(ctx, &candidates_p);
} }
} }
@ -701,8 +714,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n");

21
examples/simple/README.md Normal file
View file

@ -0,0 +1,21 @@
# llama.cpp/example/simple
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
```bash
./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
...
main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32
Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old
main: decoded 27 tokens in 2.31 s, speed: 11.68 t/s
llama_print_timings: load time = 579.15 ms
llama_print_timings: sample time = 0.72 ms / 28 runs ( 0.03 ms per token, 38888.89 tokens per second)
llama_print_timings: prompt eval time = 655.63 ms / 10 tokens ( 65.56 ms per token, 15.25 tokens per second)
llama_print_timings: eval time = 2180.97 ms / 27 runs ( 80.78 ms per token, 12.38 tokens per second)
llama_print_timings: total time = 2891.13 ms
```
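The throughput line is simply the number of generated tokens divided by wall-clock generation time (27 / 2.31 ≈ 11.7 t/s here); prompt evaluation is timed separately in the `prompt eval time` row.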

View file

@ -26,12 +26,18 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
} }
// total length of the sequence including the prompt
const int n_len = 32;
// init LLM // init LLM
llama_backend_init(params.numa); llama_backend_init(params.numa);
llama_context_params ctx_params = llama_context_default_params(); llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = 2048;
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
if (model == NULL) { if (model == NULL) {
@ -41,20 +47,31 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true); tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int max_context_size = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4; const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
if ((int) tokens_list.size() > max_tokens_list_size) { LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
return 1; return 1;
} }
fprintf(stderr, "\n\n"); // print the prompt token-by-token
fprintf(stderr, "\n");
for (auto id : tokens_list) { for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
@ -62,31 +79,41 @@ int main(int argc, char ** argv) {
fflush(stderr); fflush(stderr);
// main loop // create a llama_batch with size 512
// we use this object to submit token data for decoding
// The LLM keeps a contextual cache memory of previous token evaluation. llama_batch batch = llama_batch_init(512, 0);
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
const int n_gen = std::min(32, max_context_size); // evaluate the initial prompt
batch.n_tokens = tokens_list.size();
while (llama_get_kv_cache_token_count(ctx) < n_gen) { for (int32_t i = 0; i < batch.n_tokens; i++) {
// evaluate the transformer batch.token[i] = tokens_list[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) { // llama_decode will output logits only for the last token of the prompt
fprintf(stderr, "%s : failed to eval\n", __func__); batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch, params.n_threads) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
tokens_list.clear(); // main loop
int n_cur = batch.n_tokens;
int n_decode = 0;
const auto t_main_start = ggml_time_us();
while (n_cur <= n_len) {
// sample the next token // sample the next token
{
llama_token new_token_id = 0;
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx); auto n_vocab = llama_n_vocab(ctx);
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -97,28 +124,59 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
new_token_id = llama_sample_token_greedy(ctx , &candidates_p); // sample the most likely token
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream?
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
LOG_TEE("\n");
// is it an end of stream ?
if (new_token_id == llama_token_eos(ctx)) {
fprintf(stderr, " [end of text]\n");
break; break;
} }
// print the new token : LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout); fflush(stdout);
// prepare the next batch
batch.n_tokens = 0;
// push this new token for next evaluation // push this new token for next evaluation
tokens_list.push_back(new_token_id); batch.token [batch.n_tokens] = new_token_id;
batch.pos [batch.n_tokens] = n_cur;
batch.seq_id[batch.n_tokens] = 0;
batch.logits[batch.n_tokens] = true;
batch.n_tokens += 1;
n_decode += 1;
} }
n_cur += 1;
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch, params.n_threads)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
LOG_TEE("\n");
const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
llama_print_timings(ctx);
fprintf(stderr, "\n");
llama_batch_free(batch);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n");
return 0; return 0;
} }

View file

@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
llama_context * ctx_dft = NULL; llama_context * ctx_dft = NULL;
// load the target model // load the target model
params.perplexity = true; // HACK: enable logits_all = true params.logits_all = true;
std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
// load the draft model // load the draft model
@ -70,9 +70,9 @@ int main(int argc, char ** argv) {
const auto t_enc_start = ggml_time_us(); const auto t_enc_start = ggml_time_us();
// eval the prompt with both models // eval the prompt with both models
llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads); llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0), params.n_threads);
llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads); llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0), params.n_threads);
llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads); llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0), params.n_threads);
const auto t_enc_end = ggml_time_us(); const auto t_enc_end = ggml_time_us();
@ -134,7 +134,7 @@ int main(int argc, char ** argv) {
while (true) { while (true) {
// sample from the target model // sample from the target model
const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
last_tokens.erase(last_tokens.begin()); last_tokens.erase(last_tokens.begin());
@ -172,7 +172,8 @@ int main(int argc, char ** argv) {
LOG("out of drafted tokens\n"); LOG("out of drafted tokens\n");
} }
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx);
llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads);
++n_past_dft; ++n_past_dft;
// heuristic for n_draft // heuristic for n_draft
@ -256,7 +257,8 @@ int main(int argc, char ** argv) {
} }
// evaluate the drafted token on the draft model // evaluate the drafted token on the draft model
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx);
llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads);
++n_past_cur; ++n_past_cur;
if (grammar_dft != NULL) { if (grammar_dft != NULL) {
@ -265,7 +267,8 @@ int main(int argc, char ** argv) {
} }
// evaluate the target model on the drafted tokens // evaluate the target model on the drafted tokens
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads); llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx);
llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads);
++n_past_tgt; ++n_past_tgt;
// the first token is always proposed by the target model before the speculation loop // the first token is always proposed by the target model before the speculation loop

View file

@ -332,15 +332,23 @@ static struct ggml_tensor * llama_build_train_graphs(
} }
}; };
// KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
{
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
// rope has so many parameters that we make a custom function for it // rope has so many parameters that we make a custom function for it
auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale] auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
(struct ggml_tensor * t) -> struct ggml_tensor * { (struct ggml_tensor * t) -> struct ggml_tensor * {
// not capturing these, to silence warnings // not capturing these, to silence warnings
const int n_past = 0;
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom(ctx, return ggml_rope_custom(ctx,
t, n_past, n_rot, rope_mode, n_ctx, t, KQ_pos, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale); rope_freq_base, rope_freq_scale);
}; };
@ -440,7 +448,10 @@ static struct ggml_tensor * llama_build_train_graphs(
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
// input gradient // input gradient
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
// KQ_pos
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
ggml_allocr_alloc(alloc, t36->grad); ggml_allocr_alloc(alloc, t36->grad);
// allocating checkpoints in one block to reduce memory fragmentation // allocating checkpoints in one block to reduce memory fragmentation

View file

@ -35,6 +35,20 @@
); );
pkgs = import nixpkgs { inherit system; }; pkgs = import nixpkgs { inherit system; };
nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ]; nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
cudatoolkit_joined = with pkgs; symlinkJoin {
# HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit
# see https://github.com/NixOS/nixpkgs/issues/224291
# copied from jaxlib
name = "${cudaPackages.cudatoolkit.name}-merged";
paths = [
cudaPackages.cudatoolkit.lib
cudaPackages.cudatoolkit.out
] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [
# for some reason some of the required libs are in the targets/x86_64-linux
# directory; not sure why but this works around it
"${cudaPackages.cudatoolkit}/targets/${system}"
];
};
llama-python = llama-python =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
postPatch = '' postPatch = ''
@ -52,7 +66,8 @@
in in
{ {
packages.default = pkgs.stdenv.mkDerivation { packages.default = pkgs.stdenv.mkDerivation {
inherit name src meta postPatch nativeBuildInputs buildInputs postInstall; inherit name src meta postPatch nativeBuildInputs postInstall;
buildInputs = osSpecific;
cmakeFlags = cmakeFlags cmakeFlags = cmakeFlags
++ (if isAarch64 && isDarwin then [ ++ (if isAarch64 && isDarwin then [
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
@ -69,6 +84,13 @@
"-DLLAMA_CLBLAST=ON" "-DLLAMA_CLBLAST=ON"
]; ];
}; };
packages.cuda = pkgs.stdenv.mkDerivation {
inherit name src meta postPatch nativeBuildInputs postInstall;
buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ];
cmakeFlags = cmakeFlags ++ [
"-DLLAMA_CUBLAS=ON"
];
};
packages.rocm = pkgs.stdenv.mkDerivation { packages.rocm = pkgs.stdenv.mkDerivation {
inherit name src meta postPatch nativeBuildInputs postInstall; inherit name src meta postPatch nativeBuildInputs postInstall;
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ]; buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];

View file

@ -14,9 +14,11 @@
// for rocblas_initialize() // for rocblas_initialize()
#include "rocblas/rocblas.h" #include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__ #endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
@ -31,6 +33,9 @@
#define cublasSetStream hipblasSetStream #define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm #define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t #define cublasStatus_t hipblasStatus_t
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize #define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t #define cudaError_t hipError_t
@ -232,8 +237,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
} }
template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half> to_fp16_cuda_t;
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
@ -424,6 +433,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif #endif
#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
#define MUL_MAT_SRC1_COL_STRIDE 128 #define MUL_MAT_SRC1_COL_STRIDE 128
#define MAX_STREAMS 8 #define MAX_STREAMS 8
@ -1508,6 +1521,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
v.y = x[ib + iqs + 1]; v.y = x[ib + iqs + 1];
} }
static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
const float * x = (const float *) vx;
// automatic half -> float type cast if dfloat == float
v.x = x[ib + iqs + 0];
v.y = x[ib + iqs + 1];
}
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
const int ix = blockDim.x*blockIdx.x + threadIdx.x; const int ix = blockDim.x*blockIdx.x + threadIdx.x;
@ -1547,8 +1568,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
reinterpret_cast<half&>(y[ib].ds.y) = sum; reinterpret_cast<half&>(y[ib].ds.y) = sum;
} }
template <int qk, int qr, dequantize_kernel_t dequantize_kernel> template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) { static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
if (i >= k) { if (i >= k) {
@ -4348,8 +4369,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
} }
// rope == RoPE == rotary positional embedding // rope == RoPE == rotary positional embedding
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
const float p_delta, const int p_delta_rows, const float theta_scale) { template<typename T, bool has_pos>
static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
const int p_delta_rows, const float theta_scale) {
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (col >= ncols) { if (col >= ncols) {
@ -4358,8 +4381,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
const int row = blockDim.x*blockIdx.x + threadIdx.x; const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ncols + col; const int i = row*ncols + col;
const int i2 = row/p_delta_rows;
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); const int p = has_pos ? pos[i2] : 0;
const float p0 = p*freq_scale;
const float theta = p0*powf(theta_scale, col/2);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
@ -4370,8 +4396,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
dst[i + 1] = x0*sin_theta + x1*cos_theta; dst[i + 1] = x0*sin_theta + x1*cos_theta;
} }
static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0, template<typename T, bool has_pos>
const float p_delta, const int p_delta_rows, const float theta_scale) { static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
const int p_delta_rows, const float theta_scale) {
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (col >= ncols) { if (col >= ncols) {
@ -4380,8 +4407,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
const int row = blockDim.x*blockIdx.x + threadIdx.x; const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ncols + col/2; const int i = row*ncols + col/2;
const int i2 = row/p_delta_rows;
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); const int p = has_pos ? pos[i2] : 0;
const float p0 = p*freq_scale;
const float theta = p0*powf(theta_scale, col/2);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
@ -4392,8 +4422,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
} }
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0, static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) { const int p_delta_rows, const float theta_scale, const int n_ctx) {
const int col = blockDim.x*blockIdx.x + threadIdx.x; const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int half_n_dims = ncols/4; const int half_n_dims = ncols/4;
@ -4403,11 +4433,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
const int row = blockDim.y*blockIdx.y + threadIdx.y; const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int i = row*ncols + col; const int i = row*ncols + col;
const int i2 = row/p_delta_rows;
const float col_theta_scale = powf(theta_scale, col); const float col_theta_scale = powf(theta_scale, col);
const float p = p0 + p_delta*(row/p_delta_rows); // FIXME: this is likely wrong
const int p = pos != nullptr ? pos[i2] : 0;
const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale; const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
@ -4417,7 +4449,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
dst[i + 0] = x0*cos_theta - x1*sin_theta; dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale; const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
const float sin_block_theta = sinf(block_theta); const float sin_block_theta = sinf(block_theta);
const float cos_block_theta = cosf(block_theta); const float cos_block_theta = cosf(block_theta);
@ -4819,6 +4851,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k); dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
} }
static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@ -4828,6 +4865,15 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_fp32_to_fp16_cuda;
default:
return nullptr;
}
}
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
switch (type) { switch (type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
@ -5354,31 +5400,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k); scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
} }
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, template<typename T>
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(ncols % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nrows, num_blocks_x, 1); const dim3 block_nums(nrows, num_blocks_x, 1);
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); if (pos == nullptr) {
rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
} else {
rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
}
} }
static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, template<typename T>
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(ncols % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nrows, num_blocks_x, 1); const dim3 block_nums(nrows, num_blocks_x, 1);
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); if (pos == nullptr) {
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
} else {
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
}
} }
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
GGML_ASSERT(ncols % 4 == 0); GGML_ASSERT(ncols % 4 == 0);
const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
const dim3 block_nums(num_blocks_x, nrows, 1); const dim3 block_nums(num_blocks_x, nrows, 1);
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx); rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
} }
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@ -6009,8 +6065,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
GGML_ASSERT(src1_ddf_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr);
GGML_ASSERT(dst_dd_i != nullptr); GGML_ASSERT(dst_dd_i != nullptr);
const float alpha = 1.0f;
const float beta = 0.0f;
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
@ -6019,16 +6073,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
const int64_t ne0 = dst->ne[0]; const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
float * src0_ddq_as_f32;
size_t src0_as = 0;
if (src0->type != GGML_TYPE_F32) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
}
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
int id; int id;
CUDA_CHECK(cudaGetDevice(&id)); CUDA_CHECK(cudaGetDevice(&id));
@ -6036,6 +6080,61 @@ inline void ggml_cuda_op_mul_mat_cublas(
// ldc == nrows of the matrix that cuBLAS writes into // ldc == nrows of the matrix that cuBLAS writes into
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
const int compute_capability = g_compute_capabilities[id];
if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
// convert src1 to fp16, multiply as fp16, convert dst to fp32
half * src1_as_f16 = nullptr;
size_t src1_as = 0;
if (src1->type != GGML_TYPE_F16) {
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
GGML_ASSERT(to_fp16_cuda != nullptr);
size_t ne = src1_ncols*ne10;
src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
}
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
size_t dst_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
CUBLAS_CHECK(
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
row_diff, src1_ncols, ne10,
&alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
src1_ptr, CUDA_R_16F, ne10,
&beta_f16, dst_f16, CUDA_R_16F, ldc,
CUBLAS_COMPUTE_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
ggml_cuda_pool_free(dst_f16, dst_as);
if (src1_as != 0) {
ggml_cuda_pool_free(src1_as_f16, src1_as);
}
}
else {
float * src0_ddq_as_f32 = nullptr;
size_t src0_as = 0;
if (src0->type != GGML_TYPE_F32) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
GGML_ASSERT(to_fp32_cuda != nullptr);
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
}
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
CUBLAS_CHECK( CUBLAS_CHECK(
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
@ -6044,9 +6143,10 @@ inline void ggml_cuda_op_mul_mat_cublas(
src1_ddf_i, ne10, src1_ddf_i, ne10,
&beta, dst_dd_i, ldc)); &beta, dst_dd_i, ldc));
if (src0_as > 0) { if (src0_as != 0) {
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as); ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
} }
}
(void) dst; (void) dst;
(void) src1_ddq_i; (void) src1_ddq_i;
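The new branch above keeps the whole GEMM in half precision on newer GPUs: when the device has compute capability >= CC_TURING and src0 is contiguous F16, src1 is converted to F16 if needed, cublasGemmEx runs with CUBLAS_COMPUTE_16F, and only the result is converted back to F32. All other cases still take the original cublasSgemm path, dequantizing src0 to F32 on the fly when necessary.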
@ -6057,14 +6157,16 @@ inline void ggml_cuda_op_rope(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == dst->type);
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1]; const int64_t ne01 = src0->ne[1];
const int64_t ne2 = dst->ne[2];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);
const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3]; const int n_ctx = ((int32_t *) dst->op_params)[3];
@ -6075,19 +6177,38 @@ inline void ggml_cuda_op_rope(
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
const int32_t * pos = nullptr;
if ((mode & 1) == 0) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(src1->ne[0] == ne2);
pos = (const int32_t *) src1_dd;
}
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
// compute // compute
if (is_glm) { if (is_glm) {
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream); GGML_ASSERT(false);
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
} else if (is_neox) { } else if (is_neox) {
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); if (src0->type == GGML_TYPE_F32) {
rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
} else { } else {
rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); GGML_ASSERT(false);
}
} else {
if (src0->type == GGML_TYPE_F32) {
rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
} else {
GGML_ASSERT(false);
}
} }
(void) src1; (void) src1;
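In short, RoPE no longer derives positions from a scalar n_past: positions now arrive on the device as an I32 tensor in src1 (one entry per row group, read as pos[i2] and scaled by freq_scale inside the kernels), the kernels are templated so both F32 and F16 tensors are handled, and the GLM variant is guarded by GGML_ASSERT(false) for now since its position handling is flagged as likely wrong above.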
@ -6258,6 +6379,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
} }
} }
static void ggml_cuda_set_peer_access(const int n_tokens) {
static bool peer_access_enabled = false;
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
if (peer_access_enabled == enable_peer_access) {
return;
}
#ifdef NDEBUG
for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(ggml_cuda_set_device(id));
for (int id_other = 0; id_other < g_device_count; ++id_other) {
if (id == id_other) {
continue;
}
if (id != g_main_device && id_other != g_main_device) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
if (enable_peer_access) {
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
} else {
CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
}
}
}
}
#endif // NDEBUG
peer_access_enabled = enable_peer_access;
}
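A note on the helper above: it toggles CUDA peer-to-peer access only between the main device and the other devices, only in release builds (the loop is compiled under #ifdef NDEBUG), and only when the state actually changes; batches of at most GGML_CUDA_PEER_MAX_BATCH_SIZE tokens (128 by default) run with peer access enabled, larger ones with it disabled.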
static void ggml_cuda_op_mul_mat( static void ggml_cuda_op_mul_mat(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
const bool convert_src1_to_q8_1) { const bool convert_src1_to_q8_1) {
@ -6282,6 +6440,8 @@ static void ggml_cuda_op_mul_mat(
const int nb2 = dst->nb[2]; const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3]; const int nb3 = dst->nb[3];
ggml_cuda_set_peer_access(ne11);
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
@ -6547,27 +6707,27 @@ static void ggml_cuda_op_mul_mat(
} }
} }
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
} }
void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
} }
void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
} }
void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
} }
void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
} }
void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
} }
@ -6588,7 +6748,7 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
return false; return false;
} }
void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@ -6617,7 +6777,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
} }
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(!ggml_is_permuted(src0));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
@ -6651,7 +6811,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
} }
void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
@ -6695,11 +6855,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
} }
} }
void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
} }
void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const int64_t ne = ggml_nelements(src0); const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ne == ggml_nelements(src1));
@ -6747,29 +6907,29 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
(void) dst; (void) dst;
} }
void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_cpy(src0, dst, nullptr); ggml_cuda_cpy(src0, dst, nullptr);
(void) src1; (void) src1;
} }
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
} }
void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
} }
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
} }
void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
} }
void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
(void) src0; (void) src0;
(void) src1; (void) src1;
(void) dst; (void) dst;
@ -6892,11 +7052,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
return extra; return extra;
} }
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
if (scratch && g_scratch_size == 0) { if (scratch && g_scratch_size == 0) {
return; return;
} }
tensor->backend = GGML_BACKEND_GPU;
// recursively assign CUDA buffers until a compute tensor is found // recursively assign CUDA buffers until a compute tensor is found
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src[0]->op; const ggml_op src0_op = tensor->src[0]->op;
@ -6908,8 +7070,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
} }
tensor->backend = GGML_BACKEND_GPU;
if (scratch && no_alloc) { if (scratch && no_alloc) {
return; return;
} }
@ -6970,6 +7130,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
return; return;
} }
if (g_scratch_buffer == nullptr) { if (g_scratch_buffer == nullptr) {
ggml_cuda_set_device(g_main_device);
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
} }
@ -6993,6 +7154,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
tensor->extra = extra; tensor->extra = extra;
} }
void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
GGML_ASSERT(ggml_is_contiguous(tensor));
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
}
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, true, false, false); ggml_cuda_assign_buffers_impl(tensor, true, false, false);
} }
@ -7009,7 +7179,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, false, true, false); ggml_cuda_assign_buffers_impl(tensor, false, true, false);
} }
void ggml_cuda_set_main_device(int main_device) { void ggml_cuda_set_main_device(const int main_device) {
if (main_device >= g_device_count) { if (main_device >= g_device_count) {
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
main_device, g_device_count, g_main_device); main_device, g_device_count, g_main_device);
@ -7023,11 +7193,11 @@ void ggml_cuda_set_main_device(int main_device) {
} }
} }
void ggml_cuda_set_mul_mat_q(bool mul_mat_q) { void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
g_mul_mat_q = mul_mat_q; g_mul_mat_q = mul_mat_q;
} }
void ggml_cuda_set_scratch_size(size_t scratch_size) { void ggml_cuda_set_scratch_size(const size_t scratch_size) {
g_scratch_size = scratch_size; g_scratch_size = scratch_size;
} }

ggml-cuda.h
View file

@ -31,6 +31,7 @@ GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tens
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_set_main_device(int main_device); GGML_API void ggml_cuda_set_main_device(int main_device);
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);

ggml-metal.h
View file

@ -19,6 +19,8 @@
#pragma once #pragma once
#include "ggml.h"
#include <stddef.h> #include <stddef.h>
#include <stdbool.h> #include <stdbool.h>
@ -33,6 +35,8 @@ struct ggml_cgraph;
extern "C" { extern "C" {
#endif #endif
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
struct ggml_metal_context; struct ggml_metal_context;
// number of command buffers to use // number of command buffers to use

ggml-metal.m
View file

@ -11,11 +11,14 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
// TODO: temporary - reuse llama.cpp logging
#ifdef GGML_METAL_NDEBUG #ifdef GGML_METAL_NDEBUG
#define metal_printf(...) #define GGML_METAL_LOG_INFO(...)
#define GGML_METAL_LOG_WARN(...)
#define GGML_METAL_LOG_ERROR(...)
#else #else
#define metal_printf(...) fprintf(stderr, __VA_ARGS__) #define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#endif #endif
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
@ -100,7 +103,8 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_DECL_KERNEL(rope); GGML_METAL_DECL_KERNEL(rope_f32);
GGML_METAL_DECL_KERNEL(rope_f16);
GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f16);
GGML_METAL_DECL_KERNEL(cpy_f32_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f32);
@ -120,8 +124,37 @@ static NSString * const msl_library_source = @"see metal.metal";
@implementation GGMLMetalClass @implementation GGMLMetalClass
@end @end
ggml_log_callback ggml_metal_log_callback = NULL;
void * ggml_metal_log_user_data = NULL;
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
ggml_metal_log_callback = log_callback;
ggml_metal_log_user_data = user_data;
}
static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
if (ggml_metal_log_callback != NULL) {
va_list args;
va_start(args, format);
va_list args_copy;
va_copy(args_copy, args); // the first vsnprintf consumes args, keep a copy for the retry below
char buffer[128];
int len = vsnprintf(buffer, 128, format, args);
if (len < 128) {
ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
} else {
char* buffer2 = malloc(len+1);
vsnprintf(buffer2, len+1, format, args_copy);
buffer2[len] = 0;
ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
free(buffer2);
}
va_end(args_copy);
va_end(args);
}
}
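The helper above buffers the formatted message and hands it to an application-supplied callback instead of writing straight to stderr. A minimal sketch of hooking this up from the host side, assuming only the APIs declared in ggml.h and ggml-metal.h (the callback name and the stderr sink are illustrative choices, not part of the patch):

#include <stdio.h>

#include "ggml.h"
#include "ggml-metal.h"

// forward Metal backend messages to stderr; "text" may be a partial line,
// since the backend emits some messages in pieces
static void log_to_stderr(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
}

int main(void) {
    ggml_metal_log_set_callback(log_to_stderr, NULL);

    // all init-time messages (device name, kernel list, memory info) now go
    // through log_to_stderr instead of the old metal_printf
    struct ggml_metal_context * ctx = ggml_metal_init(1);
    if (ctx != NULL) {
        ggml_metal_free(ctx);
    }
    return 0;
}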
struct ggml_metal_context * ggml_metal_init(int n_cb) { struct ggml_metal_context * ggml_metal_init(int n_cb) {
metal_printf("%s: allocating\n", __func__); GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
id <MTLDevice> device; id <MTLDevice> device;
NSString * s; NSString * s;
@ -131,14 +164,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
NSArray * devices = MTLCopyAllDevices(); NSArray * devices = MTLCopyAllDevices();
for (device in devices) { for (device in devices) {
s = [device name]; s = [device name];
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
} }
#endif #endif
// Pick and show default Metal device // Pick and show default Metal device
device = MTLCreateSystemDefaultDevice(); device = MTLCreateSystemDefaultDevice();
s = [device name]; s = [device name];
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]); GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
// Configure context // Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
@ -165,7 +198,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) { if (error) {
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
} }
@ -179,11 +212,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
if (error) { if (error) {
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
@ -195,7 +228,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
#endif #endif
if (error) { if (error) {
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
} }
@ -207,11 +240,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(name) \ #define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \ (int) ctx->pipeline_##name.threadExecutionWidth); \
if (error) { \ if (error) { \
metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \ return NULL; \
} }
@ -261,7 +294,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_ADD_KERNEL(rope); GGML_METAL_ADD_KERNEL(rope_f32);
GGML_METAL_ADD_KERNEL(rope_f16);
GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f16);
GGML_METAL_ADD_KERNEL(cpy_f32_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f32);
@ -270,13 +304,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#undef GGML_METAL_ADD_KERNEL #undef GGML_METAL_ADD_KERNEL
} }
metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
#if TARGET_OS_OSX #if TARGET_OS_OSX
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.maxTransferRate != 0) { if (ctx->device.maxTransferRate != 0) {
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
} else { } else {
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
} }
#endif #endif
@ -284,7 +318,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
} }
void ggml_metal_free(struct ggml_metal_context * ctx) { void ggml_metal_free(struct ggml_metal_context * ctx) {
metal_printf("%s: deallocating\n", __func__); GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
#define GGML_METAL_DEL_KERNEL(name) \ #define GGML_METAL_DEL_KERNEL(name) \
[ctx->function_##name release]; \ [ctx->function_##name release]; \
[ctx->pipeline_##name release]; [ctx->pipeline_##name release];
@ -335,7 +369,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_DEL_KERNEL(rope); GGML_METAL_DEL_KERNEL(rope_f32);
GGML_METAL_DEL_KERNEL(rope_f16);
GGML_METAL_DEL_KERNEL(alibi_f32); GGML_METAL_DEL_KERNEL(alibi_f32);
GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f16);
GGML_METAL_DEL_KERNEL(cpy_f32_f32); GGML_METAL_DEL_KERNEL(cpy_f32_f32);
@ -360,7 +395,7 @@ void * ggml_metal_host_malloc(size_t n) {
void * data = NULL; void * data = NULL;
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
if (result != 0) { if (result != 0) {
metal_printf("%s: error: posix_memalign failed\n", __func__); GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
return NULL; return NULL;
} }
@ -388,7 +423,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
// Metal buffer based on the host memory pointer // Metal buffer based on the host memory pointer
// //
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
//metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
const int64_t tsize = ggml_nbytes(t); const int64_t tsize = ggml_nbytes(t);
@ -400,13 +435,13 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs; *offs = (size_t) ioffs;
//metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
return ctx->buffers[i].metal; return ctx->buffers[i].metal;
} }
} }
metal_printf("%s: error: buffer is nil\n", __func__); GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
return nil; return nil;
} }
@ -418,7 +453,7 @@ bool ggml_metal_add_buffer(
size_t size, size_t size,
size_t max_size) { size_t max_size) {
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
metal_printf("%s: too many buffers\n", __func__); GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
return false; return false;
} }
@ -428,7 +463,7 @@ bool ggml_metal_add_buffer(
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
return false; return false;
} }
} }
@ -449,11 +484,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) { if (ctx->buffers[ctx->n_buffers].metal == nil) {
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
return false; return false;
} }
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
++ctx->n_buffers; ++ctx->n_buffers;
} else { } else {
@ -473,13 +508,13 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) { if (ctx->buffers[ctx->n_buffers].metal == nil) {
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
return false; return false;
} }
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
if (i + size_step < size) { if (i + size_step < size) {
metal_printf("\n"); GGML_METAL_LOG_INFO("\n");
} }
++ctx->n_buffers; ++ctx->n_buffers;
@ -487,17 +522,17 @@ bool ggml_metal_add_buffer(
} }
#if TARGET_OS_OSX #if TARGET_OS_OSX
metal_printf(", (%8.2f / %8.2f)", GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__);
} else { } else {
metal_printf("\n"); GGML_METAL_LOG_INFO("\n");
} }
#else #else
metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
#endif #endif
} }
@ -610,7 +645,7 @@ void ggml_metal_graph_find_concurrency(
} }
if (ctx->concur_list_len > GGML_MAX_CONCUR) { if (ctx->concur_list_len > GGML_MAX_CONCUR) {
metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
} }
} }
@ -664,7 +699,7 @@ void ggml_metal_graph_compute(
continue; continue;
} }
//metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src0 = gf->nodes[i]->src[0];
struct ggml_tensor * src1 = gf->nodes[i]->src[1]; struct ggml_tensor * src1 = gf->nodes[i]->src[1];
@ -708,17 +743,17 @@ void ggml_metal_graph_compute(
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
//metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
//if (src0) { //if (src0) {
// metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
// ggml_is_contiguous(src0), src0->name); // ggml_is_contiguous(src0), src0->name);
//} //}
//if (src1) { //if (src1) {
// metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
// ggml_is_contiguous(src1), src1->name); // ggml_is_contiguous(src1), src1->name);
//} //}
//if (dst) { //if (dst) {
// metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
// dst->name); // dst->name);
//} //}
@ -736,25 +771,59 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_contiguous(src1));
// utilize float4 bool bcast_row = false;
GGML_ASSERT(ne00 % 4 == 0);
const int64_t nb = ne00/4;
if (ggml_nelements(src1) == ne10) { int64_t nb = ne00;
if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {
// src1 is a row // src1 is a row
GGML_ASSERT(ne11 == 1); GGML_ASSERT(ne11 == 1);
nb = ne00 / 4;
[encoder setComputePipelineState:ctx->pipeline_add_row]; [encoder setComputePipelineState:ctx->pipeline_add_row];
bcast_row = true;
} else { } else {
[encoder setComputePipelineState:ctx->pipeline_add]; [encoder setComputePipelineState:ctx->pipeline_add];
} }
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&nb length:sizeof(nb) atIndex:3]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20];
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21];
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
if (bcast_row) {
const int64_t n = ggml_nelements(dst)/4; const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} else {
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
}
} break; } break;
case GGML_OP_MUL: case GGML_OP_MUL:
{ {
@ -830,13 +899,13 @@ void ggml_metal_graph_compute(
} break; } break;
default: default:
{ {
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} break; } break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
const int nth = 32; const int nth = MIN(32, ne00);
if (ne00%4 == 0) { if (ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_soft_max_4]; [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
@ -889,7 +958,7 @@ void ggml_metal_graph_compute(
src1t == GGML_TYPE_F32 && src1t == GGML_TYPE_F32 &&
[ctx->device supportsFamily:MTLGPUFamilyApple7] && [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
ne00%32 == 0 && ne00%32 == 0 &&
ne11 > 1) { ne11 > 2) {
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
@ -1019,7 +1088,7 @@ void ggml_metal_graph_compute(
} break; } break;
default: default:
{ {
metal_printf("Asserting on type %d\n",(int)src0t); GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
GGML_ASSERT(false && "not implemented"); GGML_ASSERT(false && "not implemented");
} }
}; };
@ -1100,7 +1169,7 @@ void ggml_metal_graph_compute(
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
const int nth = 512; const int nth = MIN(512, ne00);
[encoder setComputePipelineState:ctx->pipeline_rms_norm]; [encoder setComputePipelineState:ctx->pipeline_rms_norm];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -1119,7 +1188,7 @@ void ggml_metal_graph_compute(
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
const int nth = 256; const int nth = MIN(256, ne00);
[encoder setComputePipelineState:ctx->pipeline_norm]; [encoder setComputePipelineState:ctx->pipeline_norm];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -1137,6 +1206,8 @@ void ggml_metal_graph_compute(
{ {
GGML_ASSERT((src0t == GGML_TYPE_F32)); GGML_ASSERT((src0t == GGML_TYPE_F32));
const int nth = MIN(1024, ne00);
const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
const int n_head = ((int32_t *) dst->op_params)[1]; const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias; float max_bias;
@ -1170,12 +1241,14 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&m0 length:sizeof( float) atIndex:18]; [encoder setBytes:&m0 length:sizeof( float) atIndex:18];
const int nth = 32;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
GGML_ASSERT(ne10 == ne02);
const int nth = MIN(1024, ne00);
const int n_past = ((int32_t *) dst->op_params)[0]; const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
@ -1185,38 +1258,44 @@ void ggml_metal_graph_compute(
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
[encoder setComputePipelineState:ctx->pipeline_rope]; switch (src0->type) {
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break;
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; default: GGML_ASSERT(false);
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; };
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&n_past length:sizeof( int) atIndex:18];
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
[encoder setBytes:&mode length:sizeof( int) atIndex:20];
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&n_past length:sizeof( int) atIndex:19];
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
[encoder setBytes:&mode length:sizeof( int) atIndex:21];
[encoder setBytes:&freq_base length:sizeof(float) atIndex:22];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
{ {
const int nth = 32; const int nth = MIN(1024, ne00);
switch (src0t) { switch (src0t) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
@ -1261,7 +1340,7 @@ void ggml_metal_graph_compute(
} break; } break;
default: default:
{ {
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} }
@ -1286,7 +1365,7 @@ void ggml_metal_graph_compute(
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
if (status != MTLCommandBufferStatusCompleted) { if (status != MTLCommandBufferStatusCompleted) {
metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} }

ggml-metal.metal
View file

@ -24,12 +24,59 @@ typedef struct {
int8_t qs[QK8_0]; // quants int8_t qs[QK8_0]; // quants
} block_q8_0; } block_q8_0;
// general-purpose kernel for addition of two tensors
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
// cons: not very efficient
kernel void kernel_add( kernel void kernel_add(
device const float4 * src0, device const char * src0,
device const float4 * src1, device const char * src1,
device float4 * dst, device char * dst,
uint tpig[[thread_position_in_grid]]) { constant int64_t & ne00,
dst[tpig] = src0[tpig] + src1[tpig]; constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant int64_t & nb00,
constant int64_t & nb01,
constant int64_t & nb02,
constant int64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant int64_t & nb13,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant int64_t & nb0,
constant int64_t & nb1,
constant int64_t & nb2,
constant int64_t & nb3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig.z;
const int64_t i02 = tgpig.y;
const int64_t i01 = tgpig.x;
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0];
src0_ptr += ntg.x*nb00;
src1_ptr += ntg.x*nb10;
dst_ptr += ntg.x*nb0;
}
} }
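A CPU reference of the broadcast rule used by the new kernel_add may make the indexing easier to follow. The sketch below is not part of the patch and assumes contiguous float data (stride 1 in dim 0), while the kernel itself handles arbitrary byte strides through the nb* arguments; as in the kernel, dim 0 must match and dims 1..3 of src1 wrap with a modulo:

#include <stdint.h>

// reference semantics of the general-purpose add: src1 is repeated along any
// of dims 1..3 where its extent is smaller than that of src0/dst
static void add_bcast_f32_ref(
        const float * src0, const float * src1, float * dst,
        int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,  // src0/dst shape
        int64_t ne11, int64_t ne12, int64_t ne13) {              // src1 extents in dims 1..3 (dim 0 == ne00)
    for (int64_t i3 = 0; i3 < ne03; ++i3) {
        for (int64_t i2 = 0; i2 < ne02; ++i2) {
            for (int64_t i1 = 0; i1 < ne01; ++i1) {
                const int64_t i13 = i3 % ne13;
                const int64_t i12 = i2 % ne12;
                const int64_t i11 = i1 % ne11;

                const float * s0 = src0 + ((i3*ne02 + i2)*ne01 + i1)*ne00;
                const float * s1 = src1 + ((i13*ne12 + i12)*ne11 + i11)*ne00;
                float       * d  = dst  + ((i3*ne02 + i2)*ne01 + i1)*ne00;

                for (int64_t i0 = 0; i0 < ne00; ++i0) {
                    d[i0] = s0[i0] + s1[i0];
                }
            }
        }
    }
}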
// assumption: src1 is a row // assumption: src1 is a row
@ -38,7 +85,7 @@ kernel void kernel_add_row(
device const float4 * src0, device const float4 * src0,
device const float4 * src1, device const float4 * src1,
device float4 * dst, device float4 * dst,
constant int64_t & nb, constant int64_t & nb [[buffer(27)]],
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig % nb]; dst[tpig] = src0[tpig] + src1[tpig % nb];
} }
@ -806,8 +853,39 @@ kernel void kernel_alibi_f32(
} }
} }
typedef void (rope_t)(
device const void * src0,
device const int32_t * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int & n_past,
constant int & n_dims,
constant int & mode,
constant float & freq_base,
constant float & freq_scale,
uint tiitg[[thread_index_in_threadgroup]],
uint3 tptg[[threads_per_threadgroup]],
uint3 tgpig[[threadgroup_position_in_grid]]);
template<typename T>
kernel void kernel_rope( kernel void kernel_rope(
device const void * src0, device const void * src0,
device const int32_t * src1,
device float * dst, device float * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
@ -839,7 +917,9 @@ kernel void kernel_rope(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); device const int32_t * pos = src1;
const int64_t p = pos[i2];
const float theta_0 = freq_scale * (float)p; const float theta_0 = freq_scale * (float)p;
const float inv_ndims = -1.f/n_dims; const float inv_ndims = -1.f/n_dims;
@ -851,11 +931,11 @@ kernel void kernel_rope(
const float cos_theta = cos(theta); const float cos_theta = cos(theta);
const float sin_theta = sin(theta); const float sin_theta = sin(theta);
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0]; const T x0 = src[0];
const float x1 = src[1]; const T x1 = src[1];
dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[1] = x0*sin_theta + x1*cos_theta; dst_data[1] = x0*sin_theta + x1*cos_theta;
@ -870,8 +950,8 @@ kernel void kernel_rope(
const int64_t i0 = ib*n_dims + ic/2; const int64_t i0 = ib*n_dims + ic/2;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0]; const float x0 = src[0];
const float x1 = src[n_dims/2]; const float x1 = src[n_dims/2];
@ -883,6 +963,9 @@ kernel void kernel_rope(
} }
} }
template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
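For reference, the rotation that kernel_rope applies to each pair (x0, x1), with theta_0 = freq_scale * p as set above and theta shrinking geometrically as the pair index i0 grows, is:

\begin{pmatrix} x_0' \\ x_1' \end{pmatrix}
  = \begin{pmatrix} \cos\theta_{i_0} & -\sin\theta_{i_0} \\ \sin\theta_{i_0} & \cos\theta_{i_0} \end{pmatrix}
    \begin{pmatrix} x_0 \\ x_1 \end{pmatrix},
\qquad
\theta_{i_0} = \mathrm{freq\_scale} \cdot p \cdot \mathrm{freq\_base}^{-i_0/n_{\mathrm{dims}}}

The f32 and f16 instantiations differ only in the storage type T of the values being rotated; theta itself is computed in float in both.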
kernel void kernel_cpy_f16_f16( kernel void kernel_cpy_f16_f16(
device const half * src0, device const half * src0,
device half * dst, device half * dst,
@ -1273,8 +1356,8 @@ kernel void kernel_mul_mat_q3_K_f32(
float yl[32]; float yl[32];
const uint16_t kmask1 = 0x3030; //const uint16_t kmask1 = 0x3030;
const uint16_t kmask2 = 0x0f0f; //const uint16_t kmask2 = 0x0f0f;
const int tid = tiisg/4; const int tid = tiisg/4;
const int ix = tiisg%4; const int ix = tiisg%4;

ggml-opencl.cpp
View file

@ -847,7 +847,7 @@ std::array<std::string, 2> mul_str_values = {
"mul_f32", "float" "mul_f32", "float"
}; };
std::string& replace(std::string& s, const std::string& from, const std::string& to) { static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
size_t pos = 0; size_t pos = 0;
while ((pos = s.find(from, pos)) != std::string::npos) { while ((pos = s.find(from, pos)) != std::string::npos) {
s.replace(pos, from.length(), to); s.replace(pos, from.length(), to);
@ -856,7 +856,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
return s; return s;
} }
std::string generate_kernels() { static std::string generate_kernels() {
std::stringstream src; std::stringstream src;
src << program_source << '\n'; src << program_source << '\n';
src << k_quants_source << '\n'; src << k_quants_source << '\n';
@ -1788,7 +1788,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
return false; return false;
} }
bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
// If device doesn't support FP16 // If device doesn't support FP16
if (!fp16_support) { if (!fp16_support) {
return false; return false;

168
ggml.c
View file

@ -6665,6 +6665,54 @@ struct ggml_tensor * ggml_cont_inplace(
return ggml_cont_impl(ctx, a, true); return ggml_cont_impl(ctx, a, true);
} }
// make contiguous, with new shape
GGML_API struct ggml_tensor * ggml_cont_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0) {
return ggml_cont_4d(ctx, a, ne0, 1, 1, 1);
}
GGML_API struct ggml_tensor * ggml_cont_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1) {
return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1);
}
GGML_API struct ggml_tensor * ggml_cont_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1);
}
struct ggml_tensor * ggml_cont_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3) {
GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
bool is_node = false;
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
ggml_format_name(result, "%s (cont)", a->name);
result->op = GGML_OP_CONT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
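ggml_cont_1d/2d/3d are thin wrappers over ggml_cont_4d, which makes a contiguous copy and assigns a new shape in a single node. A minimal usage sketch relying only on the public API (the 16 MB arena and the 8x4 shape are arbitrary example values):

#include <stddef.h>

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4); // ne = [8, 4]
    struct ggml_tensor * at = ggml_transpose(ctx, a);                       // view, ne = [4, 8], not contiguous
    struct ggml_tensor * b  = ggml_cont_2d(ctx, at, 4, 8);                  // contiguous tensor with ne = [4, 8]

    // b is only an op node here; in real code it is evaluated as part of a
    // graph (ggml_build_forward_expand + graph compute) like any other op
    (void) b;

    ggml_free(ctx);
    return 0;
}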
// ggml_reshape // ggml_reshape
struct ggml_tensor * ggml_reshape( struct ggml_tensor * ggml_reshape(
@ -7226,7 +7274,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
static struct ggml_tensor * ggml_rope_impl( static struct ggml_tensor * ggml_rope_impl(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
@ -7235,7 +7283,10 @@ static struct ggml_tensor * ggml_rope_impl(
float xpos_base, float xpos_base,
bool xpos_down, bool xpos_down,
bool inplace) { bool inplace) {
GGML_ASSERT(n_past >= 0); GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]);
bool is_node = false; bool is_node = false;
if (a->grad) { if (a->grad) {
@ -7244,7 +7295,7 @@ static struct ggml_tensor * ggml_rope_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[8] = { n_past, n_dims, mode, n_ctx }; int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 4, &freq_base, sizeof(float));
memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float));
memcpy(params + 6, &xpos_base, sizeof(float)); memcpy(params + 6, &xpos_base, sizeof(float));
@ -7254,6 +7305,7 @@ static struct ggml_tensor * ggml_rope_impl(
result->op = GGML_OP_ROPE; result->op = GGML_OP_ROPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = b;
return result; return result;
} }
@ -7261,55 +7313,55 @@ static struct ggml_tensor * ggml_rope_impl(
struct ggml_tensor * ggml_rope( struct ggml_tensor * ggml_rope(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx) { int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
} }
struct ggml_tensor * ggml_rope_inplace( struct ggml_tensor * ggml_rope_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx) { int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
} }
struct ggml_tensor * ggml_rope_custom( struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
float freq_base, float freq_base,
float freq_scale) { float freq_scale) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
} }
struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
float freq_base, float freq_base,
float freq_scale) { float freq_scale) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
} }
struct ggml_tensor * ggml_rope_xpos_inplace( struct ggml_tensor * ggml_rope_xpos_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
float base, float base,
bool down) { bool down) {
return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
} }
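All rope entry points now take the per-row positions as an I32 tensor b (with b->ne[0] == a->ne[2]) instead of a single n_past integer. A minimal sketch of the new calling convention; the helper name rope_tokens, n_rot, and the [n_rot, n_head, n_tokens] layout of cur are illustrative assumptions, not names from this change:

#include <stdint.h>

#include "ggml.h"

// cur is expected to have ne = [n_rot, n_head, n_tokens]
static struct ggml_tensor * rope_tokens(
        struct ggml_context * ctx,
        struct ggml_tensor  * cur,
        int n_past, int n_tokens, int n_rot) {
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
        // reproduces the old behaviour: token i sits at absolute position n_past + i
        ((int32_t *) pos->data)[i] = n_past + i;
    }
    return ggml_rope(ctx, cur, pos, n_rot, /*mode =*/ 0, /*n_ctx =*/ 0);
}

Filling pos with n_past + i reproduces what the old integer argument implied; a batched caller is free to store an unrelated position for every token, which is what makes the decoupling from a single n_past useful.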
// ggml_rope_back // ggml_rope_back
@ -7317,7 +7369,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace(
struct ggml_tensor * ggml_rope_back( struct ggml_tensor * ggml_rope_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
@ -7325,7 +7377,10 @@ struct ggml_tensor * ggml_rope_back(
float freq_scale, float freq_scale,
float xpos_base, float xpos_base,
bool xpos_down) { bool xpos_down) {
GGML_ASSERT(n_past >= 0); GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]);
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
bool is_node = false; bool is_node = false;
@ -7336,7 +7391,7 @@ struct ggml_tensor * ggml_rope_back(
struct ggml_tensor * result = ggml_dup_tensor(ctx, a); struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
int32_t params[8] = { n_past, n_dims, mode, n_ctx }; int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 4, &freq_base, sizeof(float));
memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float));
memcpy(params + 6, &xpos_base, sizeof(float)); memcpy(params + 6, &xpos_base, sizeof(float));
@ -7346,6 +7401,7 @@ struct ggml_tensor * ggml_rope_back(
result->op = GGML_OP_ROPE_BACK; result->op = GGML_OP_ROPE_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = b;
return result; return result;
} }
@ -9068,8 +9124,6 @@ static void ggml_compute_forward_add_f32(
#else #else
ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
#endif #endif
// }
// }
} }
} else { } else {
// src1 is not contiguous // src1 is not contiguous
@ -12920,13 +12974,11 @@ static void ggml_compute_forward_alibi_f16(
return; return;
} }
const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1]; const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias; float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
assert(n_past >= 0);
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past const int ne1 = src0->ne[1]; // seq_len_without_past
const int ne2 = src0->ne[2]; // n_head -> this is k const int ne2 = src0->ne[2]; // n_head -> this is k
@ -12941,7 +12993,7 @@ static void ggml_compute_forward_alibi_f16(
//const int nb3 = src0->nb[3]; //const int nb3 = src0->nb[3];
GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
GGML_ASSERT(n_head == ne2); GGML_ASSERT(n_head == ne2);
// add alibi to src0 (KQ_scaled) // add alibi to src0 (KQ_scaled)
@ -13087,8 +13139,8 @@ static void ggml_compute_forward_clamp(
static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
} }
@ -13100,7 +13152,7 @@ static void ggml_compute_forward_rope_f32(
float xpos_base; float xpos_base;
bool xpos_down; bool xpos_down;
const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3]; const int n_ctx = ((int32_t *) dst->op_params)[3];
@ -13109,8 +13161,6 @@ static void ggml_compute_forward_rope_f32(
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
assert(n_past >= 0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS;
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
@ -13141,9 +13191,11 @@ static void ggml_compute_forward_rope_f32(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); const int64_t p = pos[i2];
for (int64_t i1 = 0; i1 < ne1; i1++) { for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue; if (ir++ < ir0) continue;
if (ir > ir1) break; if (ir > ir1) break;
@ -13180,7 +13232,7 @@ static void ggml_compute_forward_rope_f32(
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
// zeta scaling for xPos only: // zeta scaling for xPos only:
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
if (xpos_down) zeta = 1.0f / zeta; if (xpos_down) zeta = 1.0f / zeta;
theta *= theta_scale; theta *= theta_scale;
@ -13225,8 +13277,8 @@ static void ggml_compute_forward_rope_f32(
static void ggml_compute_forward_rope_f16( static void ggml_compute_forward_rope_f16(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
} }
@ -13234,15 +13286,13 @@ static void ggml_compute_forward_rope_f16(
float freq_base; float freq_base;
float freq_scale; float freq_scale;
const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3]; const int n_ctx = ((int32_t *) dst->op_params)[3];
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
assert(n_past >= 0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS;
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
@ -13273,9 +13323,11 @@ static void ggml_compute_forward_rope_f16(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); const int64_t p = pos[i2];
for (int64_t i1 = 0; i1 < ne1; i1++) { for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue; if (ir++ < ir0) continue;
if (ir > ir1) break; if (ir > ir1) break;
@ -13354,15 +13406,16 @@ static void ggml_compute_forward_rope_f16(
static void ggml_compute_forward_rope( static void ggml_compute_forward_rope(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
ggml_compute_forward_rope_f16(params, src0, dst); ggml_compute_forward_rope_f16(params, src0, src1, dst);
} break; } break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
ggml_compute_forward_rope_f32(params, src0, dst); ggml_compute_forward_rope_f32(params, src0, src1, dst);
} break; } break;
default: default:
{ {
@ -13376,6 +13429,7 @@ static void ggml_compute_forward_rope(
static void ggml_compute_forward_rope_back_f32( static void ggml_compute_forward_rope_back_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@ -13393,7 +13447,7 @@ static void ggml_compute_forward_rope_back_f32(
float xpos_base; float xpos_base;
bool xpos_down; bool xpos_down;
const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
@ -13402,8 +13456,6 @@ static void ggml_compute_forward_rope_back_f32(
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
assert(n_past >= 0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS;
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
@ -13430,9 +13482,11 @@ static void ggml_compute_forward_rope_back_f32(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); const int64_t p = pos[i2];
for (int64_t i1 = 0; i1 < ne1; i1++) { for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue; if (ir++ < ir0) continue;
if (ir > ir1) break; if (ir > ir1) break;
@ -13444,7 +13498,7 @@ static void ggml_compute_forward_rope_back_f32(
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
// zeta scaling for xPos only: // zeta scaling for xPos only:
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
if (xpos_down) zeta = 1.0f / zeta; if (xpos_down) zeta = 1.0f / zeta;
theta *= theta_scale; theta *= theta_scale;
@ -13487,6 +13541,7 @@ static void ggml_compute_forward_rope_back_f32(
static void ggml_compute_forward_rope_back_f16( static void ggml_compute_forward_rope_back_f16(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@ -13497,12 +13552,10 @@ static void ggml_compute_forward_rope_back_f16(
// dx = rope_back(dy, src1) // dx = rope_back(dy, src1)
// src0 is dy, src1 contains options // src0 is dy, src1 contains options
const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
assert(n_past >= 0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS;
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
@ -13529,9 +13582,11 @@ static void ggml_compute_forward_rope_back_f16(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); const int64_t p = pos[i2];
for (int64_t i1 = 0; i1 < ne1; i1++) { for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue; if (ir++ < ir0) continue;
if (ir > ir1) break; if (ir > ir1) break;
@ -13583,15 +13638,16 @@ static void ggml_compute_forward_rope_back_f16(
static void ggml_compute_forward_rope_back( static void ggml_compute_forward_rope_back(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
ggml_compute_forward_rope_back_f16(params, src0, dst); ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
} break; } break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
ggml_compute_forward_rope_back_f32(params, src0, dst); ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
} break; } break;
default: default:
{ {
@ -16349,11 +16405,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
ggml_compute_forward_rope(params, tensor->src[0], tensor); ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
} break; } break;
case GGML_OP_ROPE_BACK: case GGML_OP_ROPE_BACK:
{ {
ggml_compute_forward_rope_back(params, tensor->src[0], tensor); ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
} break; } break;
case GGML_OP_ALIBI: case GGML_OP_ALIBI:
{ {
@ -17220,7 +17276,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{ {
// necessary for llama // necessary for llama
if (src0->grad) { if (src0->grad) {
const int n_past = ((int32_t *) tensor->op_params)[0]; //const int n_past = ((int32_t *) tensor->op_params)[0];
const int n_dims = ((int32_t *) tensor->op_params)[1]; const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2]; const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3]; const int n_ctx = ((int32_t *) tensor->op_params)[3];
@ -17237,7 +17293,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src0->grad, src0->grad,
ggml_rope_back(ctx, ggml_rope_back(ctx,
tensor->grad, tensor->grad,
n_past, src1,
n_dims, n_dims,
mode, mode,
n_ctx, n_ctx,
@ -17251,7 +17307,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
case GGML_OP_ROPE_BACK: case GGML_OP_ROPE_BACK:
{ {
if (src0->grad) { if (src0->grad) {
const int n_past = ((int32_t *) tensor->op_params)[0]; //const int n_past = ((int32_t *) tensor->op_params)[0];
const int n_dims = ((int32_t *) tensor->op_params)[1]; const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2]; const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3]; const int n_ctx = ((int32_t *) tensor->op_params)[3];
@ -17268,7 +17324,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src0->grad, src0->grad,
ggml_rope_impl(ctx, ggml_rope_impl(ctx,
tensor->grad, tensor->grad,
n_past, src1,
n_dims, n_dims,
mode, mode,
n_ctx, n_ctx,

52 ggml.h

@ -445,6 +445,12 @@ extern "C" {
GGML_OBJECT_WORK_BUFFER GGML_OBJECT_WORK_BUFFER
}; };
enum ggml_log_level {
GGML_LOG_LEVEL_ERROR = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_INFO = 4
};
// ggml object // ggml object
struct ggml_object { struct ggml_object {
size_t offs; size_t offs;
@ -1075,7 +1081,6 @@ extern "C" {
size_t nb1, size_t nb1,
size_t offset); size_t offset);
// a -> b, return view(b) // a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy( GGML_API struct ggml_tensor * ggml_cpy(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1098,6 +1103,33 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// make contiguous, with new shape
GGML_API struct ggml_tensor * ggml_cont_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0);
GGML_API struct ggml_tensor * ggml_cont_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1);
GGML_API struct ggml_tensor * ggml_cont_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2);
GGML_API struct ggml_tensor * ggml_cont_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3);
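
The ggml_cont_Nd helpers make a non-contiguous view contiguous and give it a new shape in a single op, replacing the old "ggml_cpy into a freshly allocated tensor" idiom. A minimal illustrative sketch (not part of the patch), assuming a context created with no_alloc == false:

#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /* .mem_size   = */ 16*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // 4 x 6 matrix filled with ones
    struct ggml_tensor * a  = ggml_set_f32(ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 6), 1.0f);
    // transposed view - same data, non-contiguous strides
    struct ggml_tensor * at = ggml_transpose(ctx, a);
    // materialize the view as a contiguous 6 x 4 tensor in one call
    struct ggml_tensor * b  = ggml_cont_2d(ctx, at, 6, 4);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, b);
    ggml_graph_compute_with_ctx(ctx, gf, /* n_threads = */ 1);

    ggml_free(ctx);
    return 0;
}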
// return view(a), b specifies the new shape // return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view // TODO: when we start computing gradient, make a copy instead of view
GGML_API struct ggml_tensor * ggml_reshape( GGML_API struct ggml_tensor * ggml_reshape(
@ -1245,14 +1277,15 @@ extern "C" {
struct ggml_tensor * b); struct ggml_tensor * b);
// rotary position embedding // rotary position embedding
// if mode & 1 == 1, skip n_past elements // if mode & 1 == 1, skip n_past elements (DEPRECATED)
// if mode & 2 == 1, GPT-NeoX style // if mode & 2 == 1, GPT-NeoX style
// if mode & 4 == 1, ChatGLM style // if mode & 4 == 1, ChatGLM style
// TODO: avoid creating a new tensor every time //
// b is an int32 vector with size a->ne[2], it contains the positions
GGML_API struct ggml_tensor * ggml_rope( GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx); int n_ctx);
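
With this change the caller no longer passes a scalar n_past; it builds an I32 tensor with one position per row of a (size a->ne[2]) and passes it as b. A minimal sketch of the common "contiguous positions starting at n_past" case, mirroring what the updated test-grad0.cpp below does (writing into pos->data directly assumes no_alloc == false; in llama.cpp the positions come from llama_batch.pos instead):

#include "ggml.h"

// apply RoPE to `cur` for n_tokens tokens whose absolute positions start at n_past
static struct ggml_tensor * rope_from_n_past(
        struct ggml_context * ctx,
        struct ggml_tensor  * cur,     // [n_embd_head, n_head, n_tokens, 1]
        int n_past, int n_dims, int mode, int n_ctx) {
    const int64_t n_tokens = cur->ne[2];

    // one int32 position per token - arbitrary values are allowed, they need not be contiguous
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    for (int64_t i = 0; i < n_tokens; ++i) {
        ((int32_t *) pos->data)[i] = n_past + (int32_t) i;
    }

    return ggml_rope(ctx, cur, pos, n_dims, mode, n_ctx);
}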
@ -1261,7 +1294,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_rope_inplace( GGML_API struct ggml_tensor * ggml_rope_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx); int n_ctx);
@ -1270,7 +1303,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_rope_custom( GGML_API struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
@ -1281,7 +1314,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_rope_custom_inplace( GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
@ -1292,7 +1325,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
float base, float base,
bool down); bool down);
@ -1302,7 +1335,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_rope_back( GGML_API struct ggml_tensor * ggml_rope_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, struct ggml_tensor * b,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
@ -1727,6 +1760,7 @@ extern "C" {
}; };
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
// optimization parameters // optimization parameters
// //


@ -32,7 +32,7 @@ KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description" KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_LICENSE = "general.license" KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url" KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository"
KEY_GENERAL_FILE_TYPE = "general.file_type" KEY_GENERAL_FILE_TYPE = "general.file_type"
# LLM # LLM

1076 llama.cpp (file diff suppressed because it is too large)

318 llama.h

@ -37,6 +37,8 @@
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#define LLAMA_MAX_RNG_STATE (64*1024)
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
@ -60,13 +62,9 @@ extern "C" {
struct llama_model; struct llama_model;
struct llama_context; struct llama_context;
typedef int llama_token; typedef int32_t llama_pos;
typedef int32_t llama_token;
enum llama_log_level { typedef int32_t llama_seq_id;
LLAMA_LOG_LEVEL_ERROR = 2,
LLAMA_LOG_LEVEL_WARN = 3,
LLAMA_LOG_LEVEL_INFO = 4
};
enum llama_vocab_type { enum llama_vocab_type {
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
@ -95,15 +93,15 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
}; };
@ -122,6 +120,35 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx); typedef void (*llama_progress_callback)(float progress, void *ctx);
// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
//
// - token : the token ids of the input (used when embd is NULL)
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence
// - seq_id : the sequence to which the respective token belongs
// - logits : if zero, the logits for the respective token will not be output
//
typedef struct llama_batch {
int32_t n_tokens;
llama_token * token;
float * embd;
llama_pos * pos;
llama_seq_id * seq_id;
int8_t * logits;
// NOTE: helpers for smooth API transition - can be deprecated in the future
// for future-proof code, use the above fields instead and ignore everything below
//
// pos[i] = all_pos_0 + i*all_pos_1
//
llama_pos all_pos_0; // used if pos == NULL
llama_pos all_pos_1; // used if pos == NULL
llama_seq_id all_seq_id; // used if seq_id == NULL
} llama_batch;
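
For code that does not want to rely on the all_pos_0/all_pos_1/all_seq_id shortcuts, the per-token arrays are filled explicitly. A minimal sketch for a single sequence, requesting logits only for the last token (llama_batch_init/llama_batch_free are declared further down in this header; the batch must be freed by the caller):

#include "llama.h"

static struct llama_batch make_single_seq_batch(
        const llama_token * tokens, int32_t n_tokens, llama_pos n_past, llama_seq_id seq_id) {
    struct llama_batch batch = llama_batch_init(n_tokens, /* embd = */ 0);

    batch.n_tokens = n_tokens;
    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.token [i] = tokens[i];
        batch.pos   [i] = n_past + i;            // absolute position within the sequence
        batch.seq_id[i] = seq_id;                // all tokens belong to the same sequence
        batch.logits[i] = (i == n_tokens - 1);   // only the last token needs logits
    }
    return batch; // release with llama_batch_free() when done
}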
struct llama_context_params { struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random uint32_t seed; // RNG seed, -1 for random
int32_t n_ctx; // text context int32_t n_ctx; // text context
@ -151,13 +178,6 @@ extern "C" {
bool embedding; // embedding mode only bool embedding; // embedding mode only
}; };
// Signature for logging events
// Note that text includes the new line character at the end for most events.
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
// if it exists.
// It might not exist for progress report where '.' is output repeatedly.
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
// model quantization parameters // model quantization parameters
typedef struct llama_model_quantize_params { typedef struct llama_model_quantize_params {
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
@ -215,6 +235,7 @@ extern "C" {
int32_t n_eval; int32_t n_eval;
}; };
// Helpers for getting default parameters
LLAMA_API struct llama_context_params llama_context_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void);
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
@ -259,8 +280,10 @@ extern "C" {
// Get a string describing the model type // Get a string describing the model type
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
// Returns the total size of all the tensors in the model in bytes // Returns the total size of all the tensors in the model in bytes
LLAMA_API uint64_t llama_model_size(const struct llama_model * model); LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
// Returns the total number of parameters in the model // Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
@ -285,7 +308,7 @@ extern "C" {
float scale, float scale,
const char * path_base_model, const char * path_base_model,
int n_threads), int n_threads),
"please use llama_model_apply_lora_from_file instead"); "use llama_model_apply_lora_from_file instead");
LLAMA_API int llama_model_apply_lora_from_file( LLAMA_API int llama_model_apply_lora_from_file(
const struct llama_model * model, const struct llama_model * model,
@ -294,11 +317,53 @@ extern "C" {
const char * path_base_model, const char * path_base_model,
int n_threads); int n_threads);
// Returns the number of tokens in the KV cache //
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); // KV cache
//
// Sets the current rng seed. // Returns the number of tokens in the KV cache
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
// Remove all tokens data of cells in [c0, c1)
LLAMA_API void llama_kv_cache_tokens_rm(
struct llama_context * ctx,
int32_t c0,
int32_t c1);
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
LLAMA_API void llama_kv_cache_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1);
// Copy all tokens that belong to the specified sequence to another sequence
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
LLAMA_API void llama_kv_cache_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1);
// Removes all tokens that do not belong to the specified sequence
LLAMA_API void llama_kv_cache_seq_keep(
struct llama_context * ctx,
llama_seq_id seq_id);
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly
LLAMA_API void llama_kv_cache_seq_shift(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta);
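
Together these primitives implement "context shifting" without re-evaluating the prompt: drop a window of old tokens from a sequence and slide the rest back (valid when the KV data is RoPEd, which llama_kv_cache_seq_shift updates in place). A minimal sketch:

#include "llama.h"

// keep the first n_keep tokens of sequence 0, discard the next n_discard,
// and shift the remaining [n_keep + n_discard, n_past) tokens back by n_discard
static void shift_context(struct llama_context * ctx, llama_pos n_keep, llama_pos n_discard, llama_pos n_past) {
    llama_kv_cache_seq_rm   (ctx, /* seq_id = */ 0, n_keep,             n_keep + n_discard);
    llama_kv_cache_seq_shift(ctx, /* seq_id = */ 0, n_keep + n_discard, n_past, -n_discard);
    // the caller is responsible for decreasing its own n_past counter by n_discard
}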
//
// State / sessions
//
// Returns the maximum size in bytes of the state (rng, logits, embedding // Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens // and kv_cache) - will often be smaller after compacting tokens
@ -307,48 +372,100 @@ extern "C" {
// Copies the state to the specified destination address. // Copies the state to the specified destination address.
// Destination needs to have allocated enough memory. // Destination needs to have allocated enough memory.
// Returns the number of bytes copied // Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); LLAMA_API size_t llama_copy_state_data(
struct llama_context * ctx,
uint8_t * dst);
// Set the state reading from the specified address // Set the state reading from the specified address
// Returns the number of bytes read // Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); LLAMA_API size_t llama_set_state_data(
struct llama_context * ctx,
uint8_t * src);
// Save/load session file // Save/load session file
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); LLAMA_API bool llama_load_session_file(
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); struct llama_context * ctx,
const char * path_session,
llama_token * tokens_out,
size_t n_token_capacity,
size_t * n_token_count_out);
// Run the llama inference to obtain the logits and probabilities for the next token. LLAMA_API bool llama_save_session_file(
struct llama_context * ctx,
const char * path_session,
const llama_token * tokens,
size_t n_token_count);
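
A typical pattern is to snapshot the whole context (rng, logits, embeddings, KV cache) into a byte buffer and restore it later, e.g. to rewind a speculative run. Minimal sketch; llama_get_state_size is the size accessor documented just above this hunk:

#include "llama.h"
#include <vector>

static std::vector<uint8_t> save_state(struct llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));   // upper bound
    buf.resize(llama_copy_state_data(ctx, buf.data()));    // shrink to the bytes actually written
    return buf;
}

static void restore_state(struct llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}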
//
// Decoding
//
// Run the llama inference to obtain the logits and probabilities for the next token(s).
// tokens + n_tokens is the provided batch of new tokens to process // tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls // n_past is the number of tokens to use from previous eval calls
// Returns 0 on success // Returns 0 on success
LLAMA_API int llama_eval( // DEPRECATED: use llama_decode() instead
LLAMA_API DEPRECATED(int llama_eval(
struct llama_context * ctx, struct llama_context * ctx,
const llama_token * tokens, llama_token * tokens,
int n_tokens, int32_t n_tokens,
int n_past, int n_past,
int n_threads); int n_threads),
"use llama_decode() instead");
// Same as llama_eval, but use float matrix input directly. // Same as llama_eval, but use float matrix input directly.
LLAMA_API int llama_eval_embd( // DEPRECATED: use llama_decode() instead
LLAMA_API DEPRECATED(int llama_eval_embd(
struct llama_context * ctx, struct llama_context * ctx,
const float * embd, float * embd,
int n_tokens, int32_t n_tokens,
int n_past, int n_past,
int n_threads); int n_threads),
"use llama_decode() instead");
// Export a static computation graph for context of 511 and batch size of 1 // Return batch for single sequence of tokens starting at pos_0
// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these //
// parameters here to keep things simple // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
// IMPORTANT: do not use for anything else other than debugging and testing! //
LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); LLAMA_API struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens,
llama_pos pos_0,
llama_seq_id seq_id);
// Allocates a batch of tokens on the heap
// The batch has to be freed with llama_batch_free()
// If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
// Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
// The rest of the llama_batch members are allocated with size n_tokens
// All members are left uninitialized
LLAMA_API struct llama_batch llama_batch_init(
int32_t n_tokens,
int32_t embd);
// Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API void llama_batch_free(struct llama_batch batch);
// Positive return values does not mean a fatal error, but rather a warning.
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// < 0 - error
LLAMA_API int llama_decode(
struct llama_context * ctx,
struct llama_batch batch,
int n_threads);
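
Migrating from llama_eval() is mostly mechanical: wrap the token span in a batch (llama_batch_get_one for single-sequence code) and check the tri-state return value. A minimal sketch:

#include "llama.h"
#include <cstdio>

// evaluate n_tokens new tokens of sequence 0 starting at position n_past
static int eval_tokens(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens,
                       llama_pos n_past, int n_threads) {
    struct llama_batch batch = llama_batch_get_one(tokens, n_tokens, n_past, /* seq_id = */ 0);

    const int ret = llama_decode(ctx, batch, n_threads);
    if (ret == 1) {
        fprintf(stderr, "no KV slot for this batch - shrink the batch or enlarge the context\n");
    } else if (ret < 0) {
        fprintf(stderr, "llama_decode() failed\n");
    }
    return ret;
}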
// Token logits obtained from the last call to llama_eval() // Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row // The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token // Logits for which llama_batch.logits[i] == 0 are undefined
// Rows: n_tokens // Rows: n_tokens provided with llama_batch
// Cols: n_vocab // Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx); LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Logits for the ith token. Equivalent to:
// llama_get_logits(ctx) + i*n_vocab
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
// Get the embeddings for the input // Get the embeddings for the input
// shape: [n_embd] (1-dimensional) // shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
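
Logits are only defined for tokens whose llama_batch.logits entry was non-zero. A small sketch of a greedy pick over the i-th row (n_vocab is assumed to be known to the caller, e.g. via the vocab accessors declared earlier in this header):

#include "llama.h"

static llama_token argmax_ith(struct llama_context * ctx, int32_t i, int n_vocab) {
    // equivalent to llama_get_logits(ctx) + i*n_vocab
    const float * logits = llama_get_logits_ith(ctx, i);

    llama_token best = 0;
    for (llama_token t = 1; t < n_vocab; ++t) {
        if (logits[t] > logits[best]) {
            best = t;
        }
    }
    return best;
}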
@ -425,11 +542,25 @@ extern "C" {
// Sampling functions // Sampling functions
// //
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); LLAMA_API void llama_sample_repetition_penalty(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t last_tokens_size,
float penalty);
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); LLAMA_API void llama_sample_frequency_and_presence_penalties(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t last_tokens_size,
float alpha_frequency,
float alpha_presence);
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@ -442,23 +573,54 @@ extern "C" {
float scale); float scale);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); LLAMA_API void llama_sample_softmax(
struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); LLAMA_API void llama_sample_top_k(
struct llama_context * ctx,
llama_token_data_array * candidates,
int k,
size_t min_keep);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); LLAMA_API void llama_sample_top_p(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); LLAMA_API void llama_sample_tail_free(
struct llama_context * ctx,
llama_token_data_array * candidates,
float z,
size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); LLAMA_API void llama_sample_typical(
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
LLAMA_API void llama_sample_temp(
struct llama_context * ctx,
llama_token_data_array * candidates,
float temp);
LLAMA_API DEPRECATED(void llama_sample_temperature(
struct llama_context * ctx,
llama_token_data_array * candidates,
float temp),
"use llama_sample_temp instead");
/// @details Apply constraints from grammar /// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar); LLAMA_API void llama_sample_grammar(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@ -466,23 +628,41 @@ extern "C" {
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); LLAMA_API llama_token llama_sample_token_mirostat(
struct llama_context * ctx,
llama_token_data_array * candidates,
float tau,
float eta,
int m,
float * mu);
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); LLAMA_API llama_token llama_sample_token_mirostat_v2(
struct llama_context * ctx,
llama_token_data_array * candidates,
float tau,
float eta,
float * mu);
/// @details Selects the token with the highest probability. /// @details Selects the token with the highest probability.
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); LLAMA_API llama_token llama_sample_token_greedy(
struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Randomly selects a token from the candidates based on their probabilities. /// @details Randomly selects a token from the candidates based on their probabilities.
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); LLAMA_API llama_token llama_sample_token(
struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Accepts the sampled token into the grammar /// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token); LLAMA_API void llama_grammar_accept_token(
struct llama_context * ctx,
struct llama_grammar * grammar,
llama_token token);
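
All of the sampling entry points operate on a llama_token_data_array built from the raw logits (llama_token_data and llama_token_data_array are declared earlier in this header, not in the hunk shown). A minimal top-k / top-p / temperature chain, using the new llama_sample_temp name:

#include "llama.h"
#include <vector>

static llama_token sample_next(struct llama_context * ctx, int32_t i_logits, int n_vocab,
                               int top_k, float top_p, float temp) {
    const float * logits = llama_get_logits_ith(ctx, i_logits);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token t = 0; t < n_vocab; ++t) {
        candidates.push_back({ t, logits[t], 0.0f });
    }

    llama_token_data_array cur = { candidates.data(), candidates.size(), false };

    llama_sample_top_k(ctx, &cur, top_k, /* min_keep = */ 1);
    llama_sample_top_p(ctx, &cur, top_p, /* min_keep = */ 1);
    llama_sample_temp (ctx, &cur, temp);

    return llama_sample_token(ctx, &cur); // penalties, grammar, mirostat etc. slot in the same way
}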
// //
// Beam search // Beam search
@ -490,6 +670,7 @@ extern "C" {
struct llama_beam_view { struct llama_beam_view {
const llama_token * tokens; const llama_token * tokens;
size_t n_tokens; size_t n_tokens;
float p; // Cumulative beam probability (renormalized relative to all beams) float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Callback should set this to true when a beam is at end-of-beam. bool eob; // Callback should set this to true when a beam is at end-of-beam.
@ -501,6 +682,7 @@ extern "C" {
// These pointers are valid only during the synchronous callback, so should not be saved. // These pointers are valid only during the synchronous callback, so should not be saved.
struct llama_beams_state { struct llama_beams_state {
struct llama_beam_view * beam_views; struct llama_beam_view * beam_views;
size_t n_beams; // Number of elements in beam_views[]. size_t n_beams; // Number of elements in beam_views[].
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams. size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
bool last_call; // True iff this is the last callback invocation. bool last_call; // True iff this is the last callback invocation.
@ -519,10 +701,18 @@ extern "C" {
/// @param n_past Number of tokens already evaluated. /// @param n_past Number of tokens already evaluated.
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier. /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
/// @param n_threads Number of threads as passed to llama_eval(). /// @param n_threads Number of threads as passed to llama_eval().
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads); LLAMA_API void llama_beam_search(
struct llama_context * ctx,
llama_beam_search_callback_fn_t callback,
void * callback_data,
size_t n_beams,
int n_past,
int n_predict,
int n_threads);
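
A beam-search callback receives a snapshot of all beams and may flag finished ones via eob. The sketch below only reports progress; it assumes the callback typedef declared above this hunk takes (void *, struct llama_beams_state), and the views are only valid for the duration of the call:

#include "llama.h"
#include <cstdio>

static void beam_progress_cb(void * callback_data, struct llama_beams_state beams_state) {
    (void) callback_data;
    for (size_t i = 0; i < beams_state.n_beams; ++i) {
        const llama_beam_view & bv = beams_state.beam_views[i];
        fprintf(stderr, "beam %zu: p = %.3f, n_tokens = %zu%s\n",
                i, bv.p, bv.n_tokens, bv.eob ? " (eob)" : "");
    }
    if (beams_state.last_call) {
        fprintf(stderr, "done, common prefix = %zu tokens\n", beams_state.common_prefix_length);
    }
}

// usage: llama_beam_search(ctx, beam_progress_cb, NULL, /* n_beams = */ 4, n_past, n_predict, n_threads);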
// Performance information // Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx); LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx); LLAMA_API void llama_reset_timings(struct llama_context * ctx);
@ -531,7 +721,7 @@ extern "C" {
// Set callback for all future logging events. // Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr. // If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
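
llama_log_set now takes the shared ggml_log_callback (llama.h includes ggml.h, so the enum should be available). A minimal sink that forwards everything to a FILE*; most messages already end in '\n', while progress dots do not:

#include "llama.h"
#include <cstdio>

static void file_logger(enum ggml_log_level level, const char * text, void * user_data) {
    FILE * f = (FILE *) user_data;
    fprintf(f, "[%d] %s", (int) level, text);
    fflush(f);
}

// usage: llama_log_set(file_logger, stderr); // or an opened log file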
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);


@ -8,16 +8,12 @@ set(BUILD_TARGET "unknown")
# Look for git # Look for git
find_package(Git) find_package(Git)
if(NOT Git_FOUND) if(NOT Git_FOUND)
execute_process( find_program(GIT_EXECUTABLE NAMES git git.exe)
COMMAND which git if(GIT_EXECUTABLE)
OUTPUT_VARIABLE GIT_EXECUTABLE
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(NOT GIT_EXECUTABLE STREQUAL "")
set(Git_FOUND TRUE) set(Git_FOUND TRUE)
message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}") message(STATUS "Found Git: ${GIT_EXECUTABLE}")
else() else()
message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.") message(WARNING "Git not found. Build info will not be accurate.")
endif() endif()
endif() endif()
@ -28,43 +24,32 @@ if(Git_FOUND)
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE HEAD OUTPUT_VARIABLE HEAD
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE GIT_HEAD_RESULT
) )
execute_process( execute_process(
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE COUNT OUTPUT_VARIABLE COUNT
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE GIT_COUNT_RESULT
) )
if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
set(BUILD_COMMIT ${HEAD})
set(BUILD_NUMBER ${COUNT})
endif()
endif()
if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
set(BUILD_COMMIT ${HEAD}) set(BUILD_COMMIT ${HEAD})
set(BUILD_NUMBER ${COUNT}) set(BUILD_NUMBER ${COUNT})
endif() endif()
execute_process( if(MSVC)
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
execute_process(
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER} COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES )
)
if (RES EQUAL 0)
set(BUILD_COMPILER ${OUT}) set(BUILD_COMPILER ${OUT})
endif() execute_process(
execute_process(
COMMAND ${CMAKE_C_COMPILER} -dumpmachine COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OUT OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES )
)
if (RES EQUAL 0)
set(BUILD_TARGET ${OUT}) set(BUILD_TARGET ${OUT})
endif() endif()


@ -1,4 +1,4 @@
#!/bin/env python3 #!/usr/bin/env python3
import os import os
import hashlib import hashlib


@ -37,6 +37,8 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp) # SLOW llama_build_and_test_executable(test-grad0.cpp) # SLOW
# llama_build_and_test_executable(test-opt.cpp) # SLOW # llama_build_and_test_executable(test-opt.cpp) # SLOW
llama_build_and_test_executable(test-rope.cpp)
# dummy executable - not installed # dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE) get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c) add_executable(${TEST_TARGET} test-c.c)


@ -1471,6 +1471,11 @@ int main(int argc, const char ** argv) {
for (int n_past = 1; n_past < ne2[2]; ++n_past) { for (int n_past = 1; n_past < ne2[2]; ++n_past) {
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
for (int i = 0; i < ne2[2]; ++i) {
((int32_t *) p->data)[i] = n_past + i;
}
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
const bool skip_past = (mode & 1); const bool skip_past = (mode & 1);
@ -1482,7 +1487,7 @@ int main(int argc, const char ** argv) {
continue; continue;
} }
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY); check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
@ -1506,6 +1511,11 @@ int main(int argc, const char ** argv) {
for (int n_past = 1; n_past < ne2[2]; ++n_past) { for (int n_past = 1; n_past < ne2[2]; ++n_past) {
x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
for (int i = 0; i < ne2[2]; ++i) {
((int32_t *) p->data)[i] = n_past + i;
}
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
const bool skip_past = (mode & 1); const bool skip_past = (mode & 1);
@ -1517,7 +1527,7 @@ int main(int argc, const char ** argv) {
continue; continue;
} }
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);

221 tests/test-rope.cpp (new file)

@ -0,0 +1,221 @@
#include "ggml.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif
#define MAX_NARGS 3
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_SILU_FP16
//
// logging
//
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif
#define GGML_PRINT(...) printf(__VA_ARGS__)
static float frand(void) {
return (float)rand()/(float)RAND_MAX;
}
static int irand(int n) {
if (n == 0) return 0;
return rand()%n;
}
static void get_random_dims(int64_t * dims, int ndims) {
dims[0] = dims[1] = dims[2] = dims[3] = 1;
for (int i = 0; i < ndims; i++) {
dims[i] = 1 + irand(4);
}
}
static struct ggml_tensor * get_random_tensor_f32(
struct ggml_context * ctx0,
int ndims,
const int64_t ne[],
float fmin,
float fmax) {
struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
switch (ndims) {
case 1:
for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
}
break;
case 2:
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
}
}
break;
case 3:
for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
}
}
}
break;
case 4:
for (int i3 = 0; i3 < ne[3]; i3++) {
for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
}
}
}
}
break;
default:
assert(false);
};
return result;
}
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}
ggml_graph_compute(graph, &plan);
}
int main(int /*argc*/, const char ** /*argv*/) {
struct ggml_init_params params = {
/* .mem_size = */ 128*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
std::vector<uint8_t> work_buffer;
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_tensor * x;
// rope f32
for (int m = 0; m < 3; ++m) {
const int ndims = 4;
const int64_t n_rot = 128;
const int64_t ne[4] = { 2*n_rot, 32, 73, 1 };
const int n_past_0 = 100;
const int n_past_2 = 33;
struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
for (int i = 0; i < ne[2]; ++i) {
((int32_t *) p0->data)[i] = n_past_0 + i;
((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
((int32_t *) p2->data)[i] = n_past_2 + i;
}
// test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
// 100, 101, 102, ..., 172
struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode, 1024);
// -67, -67, -67, ..., -67
struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode, 1024); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
// 33, 34, 35, ..., 105
struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode, 1024);
ggml_cgraph * gf = ggml_new_graph(ctx0);
ggml_build_forward_expand(gf, r0);
ggml_build_forward_expand(gf, r1);
ggml_build_forward_expand(gf, r2);
ggml_graph_compute_helper(work_buffer, gf, 4);
// check that r1 and r2 are the same
{
double sum0 = 0.0f;
double sum1 = 0.0f;
double diff = 0.0f;
const float * r1_data = (float *) r1->data;
const float * r2_data = (float *) r2->data;
const int n_elements = ggml_nelements(r1);
for (int i = 0; i < n_elements; ++i) {
sum0 += fabs(r1_data[i]);
sum1 += fabs(r2_data[i]);
diff += fabs(r1_data[i] - r2_data[i]);
//if (fabs(r1_data[i] - r2_data[i]) > 0.0001f) {
// printf("%d: %f %f\n", i, r1_data[i], r2_data[i]);
// printf("diff: %f\n", fabs(r1_data[i] - r2_data[i]));
//}
}
//for (int i = 4096; i < 4096 + 128; ++i) {
// printf("%f %f\n", r1_data[i], r2_data[i]);
//}
printf("mode: %d\n", mode);
printf("sum0: %f\n", sum0);
printf("sum1: %f\n", sum1);
printf("diff: %f\n", diff);
printf("rel err: %f\n", diff / sum0);
printf("rel err: %f\n", diff / sum1);
GGML_ASSERT(diff / sum0 < 0.0001f);
GGML_ASSERT(diff / sum1 < 0.0001f);
}
}
ggml_free(ctx0);
return 0;
}