Merge branch 'master' of https://github.com/ggerganov/llama.cpp
This commit is contained in:
commit
9c76cdba16
71 changed files with 17066 additions and 20938 deletions
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
|
@ -92,7 +92,7 @@ jobs:
|
|||
name: llama-bin-macos-arm64.zip
|
||||
|
||||
macOS-latest-cmake-x64:
|
||||
runs-on: macos-12
|
||||
runs-on: macos-13
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
|
|
@ -48,10 +48,23 @@
|
|||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-apple-clang", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-apple-clang-debug" , "inherits": [ "base", "arm64-apple-clang", "debug" ] },
|
||||
{ "name": "arm64-apple-clang-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
||||
{ "name": "arm64-apple-clang+static-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||
|
|
27
Makefile
27
Makefile
|
@ -1,7 +1,6 @@
|
|||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
libllava.a \
|
||||
llama-baby-llama \
|
||||
llama-batched \
|
||||
llama-batched-bench \
|
||||
llama-bench \
|
||||
|
@ -34,6 +33,7 @@ BUILD_TARGETS = \
|
|||
llama-save-load-state \
|
||||
llama-server \
|
||||
llama-simple \
|
||||
llama-simple-chat \
|
||||
llama-speculative \
|
||||
llama-tokenize \
|
||||
llama-vdot \
|
||||
|
@ -55,7 +55,6 @@ TEST_TARGETS = \
|
|||
tests/test-llama-grammar \
|
||||
tests/test-log \
|
||||
tests/test-model-load-cancel \
|
||||
tests/test-opt \
|
||||
tests/test-quantize-fns \
|
||||
tests/test-quantize-perf \
|
||||
tests/test-rope \
|
||||
|
@ -63,6 +62,7 @@ TEST_TARGETS = \
|
|||
tests/test-tokenizer-0 \
|
||||
tests/test-tokenizer-1-bpe \
|
||||
tests/test-tokenizer-1-spm
|
||||
# tests/test-opt \
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
|
||||
|
@ -915,6 +915,7 @@ endif # GGML_METAL
|
|||
|
||||
OBJ_GGML += \
|
||||
ggml/src/ggml.o \
|
||||
ggml/src/ggml-cpu.o \
|
||||
ggml/src/ggml-alloc.o \
|
||||
ggml/src/ggml-backend.o \
|
||||
ggml/src/ggml-quants.o \
|
||||
|
@ -935,7 +936,6 @@ OBJ_COMMON = \
|
|||
common/console.o \
|
||||
common/ngram-cache.o \
|
||||
common/sampling.o \
|
||||
common/train.o \
|
||||
common/build-info.o \
|
||||
common/json-schema-to-grammar.o
|
||||
|
||||
|
@ -1047,6 +1047,12 @@ ggml/src/ggml.o: \
|
|||
ggml/include/ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-cpu.o: \
|
||||
ggml/src/ggml-cpu.c \
|
||||
ggml/include/ggml.h \
|
||||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-alloc.o: \
|
||||
ggml/src/ggml-alloc.c \
|
||||
ggml/include/ggml.h \
|
||||
|
@ -1212,11 +1218,6 @@ common/json-schema-to-grammar.o: \
|
|||
common/json-schema-to-grammar.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/train.o: \
|
||||
common/train.cpp \
|
||||
common/train.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/ngram-cache.o: \
|
||||
common/ngram-cache.cpp \
|
||||
common/ngram-cache.h
|
||||
|
@ -1287,6 +1288,11 @@ llama-simple: examples/simple/simple.cpp \
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-simple-chat: examples/simple-chat/simple-chat.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-tokenize: examples/tokenize/tokenize.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
@ -1384,11 +1390,6 @@ llama-bench: examples/llama-bench/llama-bench.cpp \
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-baby-llama: examples/baby-llama/baby-llama.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-export-lora: examples/export-lora/export-lora.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
|
|
@ -20,6 +20,7 @@ var ggmlSources = [
|
|||
"src/ggml.c",
|
||||
"src/ggml-alloc.c",
|
||||
"src/ggml-backend.cpp",
|
||||
"src/ggml-cpu.c",
|
||||
"src/ggml-quants.c",
|
||||
"src/ggml-aarch64.c"
|
||||
]
|
||||
|
@ -34,10 +35,30 @@ var cSettings: [CSetting] = [
|
|||
.define("ACCELERATE_NEW_LAPACK"),
|
||||
.define("ACCELERATE_LAPACK_ILP64"),
|
||||
]
|
||||
|
||||
var sources = [
|
||||
"src/llama.cpp",
|
||||
"src/llama-vocab.cpp",
|
||||
"src/llama-grammar.cpp",
|
||||
"src/llama-sampling.cpp",
|
||||
"src/unicode.cpp",
|
||||
"src/unicode-data.cpp",
|
||||
"ggml/src/ggml.c",
|
||||
"ggml/src/ggml-cpu.c",
|
||||
"ggml/src/ggml-alloc.c",
|
||||
"ggml/src/ggml-backend.cpp",
|
||||
"ggml/src/ggml-quants.c",
|
||||
"ggml/src/ggml-aarch64.c",
|
||||
"common/sampling.cpp",
|
||||
"common/common.cpp",
|
||||
"common/json-schema-to-grammar.cpp",
|
||||
"common/log.cpp",
|
||||
]
|
||||
#if canImport(Darwin)
|
||||
sources.append("ggml/src/ggml-metal.m")
|
||||
ggmlSources.append("src/ggml-metal.m")
|
||||
resources.append(.process("src/ggml-metal.metal"))
|
||||
//resources.append(.process("src/ggml-metal.metal"))
|
||||
resources.append(.process("ggml/src/ggml-metal.metal"))
|
||||
|
||||
linkerSettings.append(.linkedFramework("Accelerate"))
|
||||
cSettings.append(
|
||||
contentsOf: [
|
||||
|
@ -67,30 +88,48 @@ let package = Package(
|
|||
.package(url: "https://github.com/apple/swift-syntax.git", branch: "main")
|
||||
],
|
||||
targets: [
|
||||
.target(name: "llama_cpp",
|
||||
path: ".",
|
||||
exclude: [
|
||||
"cmake",
|
||||
"examples",
|
||||
"scripts",
|
||||
"models",
|
||||
"tests",
|
||||
"CMakeLists.txt",
|
||||
"Makefile",
|
||||
"ggml"
|
||||
],
|
||||
sources: cppSources,
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: cSettings),
|
||||
.target(
|
||||
name: "llama",
|
||||
dependencies: ["llama_cpp"],
|
||||
path: "ggml",
|
||||
sources: ggmlSources,
|
||||
path: ".",
|
||||
exclude: [
|
||||
"cmake",
|
||||
"examples",
|
||||
"scripts",
|
||||
"models",
|
||||
"tests",
|
||||
"CMakeLists.txt",
|
||||
"Makefile"
|
||||
],
|
||||
sources: sources,
|
||||
resources: resources,
|
||||
publicHeadersPath: "include",
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: cSettings,
|
||||
linkerSettings: linkerSettings),
|
||||
linkerSettings: linkerSettings
|
||||
),
|
||||
// .target(name: "llama_cpp",
|
||||
// path: ".",
|
||||
// exclude: [
|
||||
// "cmake",
|
||||
// "examples",
|
||||
// "scripts",
|
||||
// "models",
|
||||
// "tests",
|
||||
// "CMakeLists.txt",
|
||||
// "Makefile",
|
||||
// "ggml"
|
||||
// ],
|
||||
// sources: cppSources,
|
||||
// publicHeadersPath: "spm-headers",
|
||||
// cSettings: cSettings),
|
||||
// .target(
|
||||
// name: "llama",
|
||||
// dependencies: ["llama_cpp"],
|
||||
// path: "ggml",
|
||||
// sources: ggmlSources,
|
||||
// resources: resources,
|
||||
// publicHeadersPath: "include",
|
||||
// cSettings: cSettings,
|
||||
// linkerSettings: linkerSettings),
|
||||
.target(name: "LlamaObjC",
|
||||
dependencies: ["llama"],
|
||||
path: "objc",
|
||||
|
|
|
@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
|||
|
||||
## Hot topics
|
||||
|
||||
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
|
||||
- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
|
||||
----
|
||||
|
|
164
ci/run.sh
164
ci/run.sh
|
@ -326,36 +326,36 @@ function gg_run_open_llama_7b_v2 {
|
|||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -460,34 +460,34 @@ function gg_run_pythia_1_4b {
|
|||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -591,36 +591,36 @@ function gg_run_pythia_2_8b {
|
|||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -706,8 +706,8 @@ function gg_run_embd_bge_small {
|
|||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
@ -752,7 +752,7 @@ function gg_run_rerank_tiny {
|
|||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
|
||||
# for this model, the SEP token is "</s>"
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
|
||||
# sample output
|
||||
# rerank score 0: 0.029
|
||||
|
|
16
cmake/arm64-apple-clang.cmake
Normal file
16
cmake/arm64-apple-clang.cmake
Normal file
|
@ -0,0 +1,16 @@
|
|||
set( CMAKE_SYSTEM_NAME Darwin )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-apple-darwin-macho )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
|
||||
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
|
@ -66,8 +66,6 @@ add_library(${TARGET} STATIC
|
|||
ngram-cache.h
|
||||
sampling.cpp
|
||||
sampling.h
|
||||
train.cpp
|
||||
train.h
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
|
|
|
@ -1951,6 +1951,8 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
|||
|
||||
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
ggml_cpu_init(); // some ARM features are detected at runtime
|
||||
|
||||
const auto & sparams = params.sparams;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
||||
|
|
|
@ -155,7 +155,7 @@ struct common_sampler_params {
|
|||
|
||||
struct common_params {
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 0; // context size
|
||||
int32_t n_ctx = 4096; // context size
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
|
|
1515
common/train.cpp
1515
common/train.cpp
File diff suppressed because it is too large
Load diff
233
common/train.h
233
common/train.h
|
@ -1,233 +0,0 @@
|
|||
// Various helper functions and utilities for training
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#define LLAMA_TRAIN_MAX_NODES 16384
|
||||
|
||||
typedef std::string mt19937_state;
|
||||
|
||||
struct train_state {
|
||||
struct ggml_opt_context * opt;
|
||||
|
||||
uint64_t train_its;
|
||||
uint64_t train_samples;
|
||||
uint64_t train_tokens;
|
||||
uint64_t train_epochs;
|
||||
|
||||
size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
|
||||
mt19937_state shuffle_rng_state_current;
|
||||
mt19937_state shuffle_rng_state_next;
|
||||
size_t shuffle_sample_count;
|
||||
size_t shuffle_next_sample;
|
||||
};
|
||||
|
||||
struct train_params_common {
|
||||
const char * fn_train_data;
|
||||
const char * fn_checkpoint_in;
|
||||
const char * fn_checkpoint_out;
|
||||
const char * pattern_fn_it;
|
||||
const char * fn_latest;
|
||||
|
||||
bool print_usage;
|
||||
|
||||
int save_every;
|
||||
|
||||
uint32_t seed;
|
||||
|
||||
int n_ctx;
|
||||
int n_threads;
|
||||
int n_batch;
|
||||
int n_gradient_accumulation;
|
||||
int n_epochs;
|
||||
int n_gpu_layers;
|
||||
|
||||
bool custom_n_ctx;
|
||||
|
||||
bool use_flash;
|
||||
bool use_checkpointing;
|
||||
|
||||
std::string sample_start;
|
||||
bool include_sample_start;
|
||||
bool escape;
|
||||
bool overlapping_samples;
|
||||
bool fill_with_next_samples;
|
||||
bool separate_with_eos;
|
||||
bool separate_with_bos;
|
||||
bool sample_random_offsets;
|
||||
|
||||
bool force_reshuffle;
|
||||
|
||||
int warmup;
|
||||
int cos_decay_steps;
|
||||
float cos_decay_restart;
|
||||
float cos_decay_min;
|
||||
bool enable_restart;
|
||||
|
||||
int opt_past;
|
||||
float opt_delta;
|
||||
int opt_max_no_improvement;
|
||||
|
||||
int adam_n_iter;
|
||||
float adam_alpha;
|
||||
float adam_min_alpha;
|
||||
float adam_decay;
|
||||
int adam_decay_min_ndim;
|
||||
float adam_beta1;
|
||||
float adam_beta2;
|
||||
float adam_gclip;
|
||||
float adam_eps_f;
|
||||
};
|
||||
|
||||
typedef void (*save_train_files_callback)(void * data, struct train_state * train);
|
||||
|
||||
struct train_opt_callback_data {
|
||||
struct train_params_common * params;
|
||||
struct train_state * train;
|
||||
save_train_files_callback save_cb;
|
||||
void * save_data;
|
||||
struct llama_context * lctx;
|
||||
int last_save_iter;
|
||||
llama_token * tokens_data;
|
||||
size_t tokens_size;
|
||||
size_t * samples_begin;
|
||||
size_t * samples_size;
|
||||
size_t * shuffled_samples_offs;
|
||||
size_t * shuffled_samples_begin;
|
||||
size_t * shuffled_samples_size;
|
||||
size_t samples_count;
|
||||
struct ggml_tensor * tokens_input;
|
||||
struct ggml_tensor * target_probs;
|
||||
int first_iter;
|
||||
int first_epoch;
|
||||
int iter_at_last_epoch;
|
||||
int64_t last_time;
|
||||
double millis_per_iter;
|
||||
};
|
||||
|
||||
struct train_state * init_train_state();
|
||||
void free_train_state(struct train_state * state);
|
||||
|
||||
struct train_params_common get_default_train_params_common();
|
||||
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
|
||||
|
||||
bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
|
||||
void finish_processing_train_args(struct train_params_common * params);
|
||||
|
||||
struct random_normal_distribution;
|
||||
struct random_uniform_distribution;
|
||||
|
||||
struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
|
||||
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
|
||||
|
||||
void free_random_normal_distribution (struct random_normal_distribution * rnd);
|
||||
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
|
||||
|
||||
struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
|
||||
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
|
||||
|
||||
// generate random float in interval [0,1)
|
||||
float frand();
|
||||
float frand_normal (struct random_normal_distribution * rnd);
|
||||
float frand_uniform(struct random_uniform_distribution * rnd);
|
||||
|
||||
int clamp (const int v, const int min, const int max);
|
||||
float fclamp(const float v, const float min, const float max);
|
||||
|
||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
|
||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
|
||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
|
||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
|
||||
|
||||
size_t tokenize_file(
|
||||
struct llama_context * lctx,
|
||||
const char * filename,
|
||||
const std::string & sample_start,
|
||||
bool include_sample_start,
|
||||
bool overlapping_samples,
|
||||
unsigned context_length,
|
||||
std::vector<llama_token> & out_tokens,
|
||||
std::vector<size_t> & out_samples_begin,
|
||||
std::vector<size_t> & out_samples_size);
|
||||
|
||||
int64_t get_example_targets_batch(
|
||||
struct llama_context * lctx,
|
||||
struct ggml_tensor * tokens_input,
|
||||
struct ggml_tensor * target_probs,
|
||||
int64_t example_id,
|
||||
const size_t * samples_offs,
|
||||
const size_t * samples_begin,
|
||||
const size_t * samples_size,
|
||||
size_t samples_count,
|
||||
const llama_token * train_data,
|
||||
size_t n_train_data,
|
||||
bool separate_with_eos,
|
||||
bool separate_with_bos,
|
||||
bool fill_with_next_samples,
|
||||
bool sample_random_offsets);
|
||||
|
||||
|
||||
void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
|
||||
mt19937_state mt19937_get_state(const std::mt19937& rng);
|
||||
mt19937_state mt19937_seed_to_state(unsigned seed);
|
||||
|
||||
mt19937_state shuffle_samples(
|
||||
const mt19937_state & rng_state,
|
||||
size_t * shuffled_offs,
|
||||
size_t * shuffled_begins,
|
||||
size_t * shuffled_sizes,
|
||||
const size_t * begins,
|
||||
const size_t * sizes,
|
||||
size_t count);
|
||||
|
||||
size_t hash_combine(size_t h1, size_t h2);
|
||||
|
||||
size_t compute_samples_hash(
|
||||
const char* fn,
|
||||
const size_t* samples_begin,
|
||||
const size_t* samples_size,
|
||||
size_t sample_count);
|
||||
|
||||
|
||||
std::string replace_str(const char * s, const char * needle, const char * replacement);
|
||||
|
||||
void print_duration(double milliseconds);
|
||||
|
||||
float cosine_decay(
|
||||
int64_t step,
|
||||
int64_t decay_steps,
|
||||
float minimum);
|
||||
|
||||
float cosine_decay_restart(
|
||||
int64_t step,
|
||||
int64_t decay_steps,
|
||||
float minimum,
|
||||
float restart_step_mult);
|
||||
|
||||
float learning_schedule(
|
||||
int64_t step,
|
||||
int64_t warmup_steps,
|
||||
int64_t decay_steps,
|
||||
float learning_rate,
|
||||
float overall_minimum,
|
||||
float cos_decay_minimum,
|
||||
float cos_decay_restart_step_mult,
|
||||
bool enable_restart);
|
||||
|
||||
void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
|
||||
|
||||
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
|
||||
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
|
||||
|
||||
bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
|
||||
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
|
||||
|
||||
std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
|
||||
|
||||
void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
|
|
@ -72,7 +72,8 @@ class Model:
|
|||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||
use_temp_file: bool = False, eager: bool = False,
|
||||
metadata_override: Path | None = None, model_name: str | None = None,
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
|
||||
small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
|
||||
if type(self) is Model:
|
||||
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
||||
|
||||
|
@ -87,7 +88,7 @@ class Model:
|
|||
self.is_safetensors = len(self.part_names) > 0
|
||||
if not self.is_safetensors:
|
||||
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
||||
self.hparams = Model.load_hparams(self.dir_model)
|
||||
self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
|
||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
self.tensor_names = None
|
||||
|
@ -1541,6 +1542,17 @@ class LlamaModel(Model):
|
|||
special_vocab._set_special_token("eot", 32010)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
# Apply to granite small models only
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
|
@ -1557,17 +1569,6 @@ class LlamaModel(Model):
|
|||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
# Apply to granite small models only
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
|
|
|
@ -12,6 +12,7 @@ import json
|
|||
from math import prod
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
||||
from transformers import AutoConfig
|
||||
|
||||
import torch
|
||||
|
||||
|
@ -256,8 +257,8 @@ def parse_args() -> argparse.Namespace:
|
|||
help="only print out what will be done, without writing any new files",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base", type=Path, required=True,
|
||||
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
|
||||
"--base", type=Path,
|
||||
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"lora_path", type=Path,
|
||||
|
@ -267,6 +268,12 @@ def parse_args() -> argparse.Namespace:
|
|||
return parser.parse_args()
|
||||
|
||||
|
||||
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
|
||||
# normally, adapter does not come with base model config, we need to load it from AutoConfig
|
||||
config = AutoConfig.from_pretrained(hf_model_id)
|
||||
return config.to_dict()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
@ -281,7 +288,7 @@ if __name__ == '__main__':
|
|||
|
||||
ftype = ftype_map[args.outtype]
|
||||
|
||||
dir_base_model: Path = args.base
|
||||
dir_base_model: Path | None = args.base
|
||||
dir_lora: Path = args.lora_path
|
||||
lora_config = dir_lora / "adapter_config.json"
|
||||
input_model = dir_lora / "adapter_model.safetensors"
|
||||
|
@ -301,9 +308,29 @@ if __name__ == '__main__':
|
|||
input_model = os.path.join(dir_lora, "adapter_model.bin")
|
||||
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
|
||||
|
||||
# load LoRA config
|
||||
with open(lora_config, "r") as f:
|
||||
lparams: dict[str, Any] = json.load(f)
|
||||
|
||||
# load base model
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = Model.load_hparams(dir_base_model)
|
||||
if dir_base_model is None:
|
||||
if "base_model_name_or_path" in lparams:
|
||||
model_id = lparams["base_model_name_or_path"]
|
||||
logger.info(f"Loading base model from Hugging Face: {model_id}")
|
||||
try:
|
||||
hparams = load_hparams_from_hf(model_id)
|
||||
except OSError as e:
|
||||
logger.error(f"Failed to load base model config: {e}")
|
||||
logger.error("Please try downloading the base model and add its path to --base")
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.error("'base_model_name_or_path' is not found in adapter_config.json")
|
||||
logger.error("Base model config is required. Please download the base model and add its path to --base")
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = Model.load_hparams(dir_base_model)
|
||||
|
||||
with torch.inference_mode():
|
||||
try:
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
|
@ -323,13 +350,15 @@ if __name__ == '__main__':
|
|||
self.dir_model_card = dir_lora_model
|
||||
self.lora_alpha = float(lora_alpha)
|
||||
|
||||
def set_vocab(self):
|
||||
pass
|
||||
|
||||
def set_type(self):
|
||||
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
|
||||
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
|
||||
super().set_gguf_parameters()
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
|
||||
|
@ -350,7 +379,7 @@ if __name__ == '__main__':
|
|||
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
|
||||
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
|
||||
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
|
||||
logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
|
||||
logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
|
||||
sys.exit(1)
|
||||
|
||||
if base_name in tensor_map:
|
||||
|
@ -384,9 +413,6 @@ if __name__ == '__main__':
|
|||
yield (dest_name + ".lora_a", lora_a)
|
||||
yield (dest_name + ".lora_b", lora_b)
|
||||
|
||||
with open(lora_config, "r") as f:
|
||||
lparams: dict[str, Any] = json.load(f)
|
||||
|
||||
alpha: float = lparams["lora_alpha"]
|
||||
|
||||
model_instance = LoraModel(
|
||||
|
@ -399,6 +425,7 @@ if __name__ == '__main__':
|
|||
dry_run=args.dry_run,
|
||||
dir_lora_model=dir_lora,
|
||||
lora_alpha=alpha,
|
||||
hparams=hparams,
|
||||
)
|
||||
|
||||
logger.info("Exporting model...")
|
||||
|
|
|
@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
|||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
|
@ -49,6 +48,7 @@ else()
|
|||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(tokenize)
|
||||
endif()
|
||||
|
|
|
@ -1,5 +0,0 @@
|
|||
set(TARGET llama-baby-llama)
|
||||
add_executable(${TARGET} baby-llama.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
File diff suppressed because it is too large
Load diff
|
@ -4,6 +4,7 @@
|
|||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
#include "ggml-cpu.h"
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
|
|
@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
|
|||
|
||||
### GET `/slots`: Returns the current slots processing state
|
||||
|
||||
This endpoint can be disabled with `--no-slots`
|
||||
> [!WARNING]
|
||||
> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
|
||||
|
||||
This endpoint is disabled by default and can be enabled with `--slots`
|
||||
|
||||
If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots.
|
||||
|
||||
|
@ -709,6 +712,7 @@ Example:
|
|||
"grammar": "",
|
||||
"id": 0,
|
||||
"ignore_eos": false,
|
||||
"is_processing": false,
|
||||
"logit_bias": [],
|
||||
"min_p": 0.05000000074505806,
|
||||
"mirostat": 0,
|
||||
|
@ -741,7 +745,6 @@ Example:
|
|||
"temperature"
|
||||
],
|
||||
"seed": 42,
|
||||
"state": 1,
|
||||
"stop": [
|
||||
"\n"
|
||||
],
|
||||
|
@ -755,10 +758,6 @@ Example:
|
|||
]
|
||||
```
|
||||
|
||||
Possible values for `slot[i].state` are:
|
||||
- `0`: SLOT_STATE_IDLE
|
||||
- `1`: SLOT_STATE_PROCESSING
|
||||
|
||||
### GET `/metrics`: Prometheus compatible metrics exporter
|
||||
|
||||
This endpoint is only accessible if `--metrics` is set.
|
||||
|
|
|
@ -247,6 +247,7 @@ struct server_slot {
|
|||
if (is_processing()) {
|
||||
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
|
||||
|
||||
t_last_used = ggml_time_us();
|
||||
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
|
||||
state = SLOT_STATE_IDLE;
|
||||
callback_on_release(id);
|
||||
|
@ -725,12 +726,12 @@ struct server_context {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
server_slot * get_available_slot(const std::string & prompt) {
|
||||
server_slot * get_available_slot(const server_task & task) {
|
||||
server_slot * ret = nullptr;
|
||||
|
||||
// find the slot that has at least n% prompt similarity
|
||||
if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
|
||||
int max_lcp_len = 0;
|
||||
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
|
||||
int lcs_len = 0;
|
||||
float similarity = 0;
|
||||
|
||||
for (server_slot & slot : slots) {
|
||||
|
@ -740,25 +741,26 @@ struct server_context {
|
|||
}
|
||||
|
||||
// skip the slot if it does not contains cached tokens
|
||||
if (slot.prompt_tokens.empty()) {
|
||||
if (slot.cache_tokens.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// length of the Longest Common Prefix between the current slot's prompt and the input prompt
|
||||
int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
|
||||
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
|
||||
int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
|
||||
|
||||
// fraction of the common substring length compared to the current slot's prompt length
|
||||
similarity = static_cast<float>(lcp_len) / static_cast<int>(slot.prompt_tokens.size());
|
||||
// fraction of the common subsequence length compared to the current slot's prompt length
|
||||
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
|
||||
|
||||
// select the current slot if the criteria match
|
||||
if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
|
||||
max_lcp_len = lcp_len;
|
||||
if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
|
||||
lcs_len = cur_lcs_len;
|
||||
similarity = cur_similarity;
|
||||
ret = &slot;
|
||||
}
|
||||
}
|
||||
|
||||
if (ret != nullptr) {
|
||||
SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
|
||||
SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1514,18 +1516,7 @@ struct server_context {
|
|||
{
|
||||
const int id_slot = json_value(task.data, "id_slot", -1);
|
||||
|
||||
server_slot * slot;
|
||||
|
||||
if (id_slot != -1) {
|
||||
slot = get_slot_by_id(id_slot);
|
||||
} else {
|
||||
std::string prompt;
|
||||
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
|
||||
prompt = json_value(task.data, "prompt", std::string());
|
||||
}
|
||||
|
||||
slot = get_available_slot(prompt);
|
||||
}
|
||||
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
|
||||
|
||||
if (slot == nullptr) {
|
||||
// if no slot is available, we defer this task for processing later
|
||||
|
@ -1575,11 +1566,11 @@ struct server_context {
|
|||
|
||||
for (server_slot & slot : slots) {
|
||||
json slot_data = get_formated_generation(slot);
|
||||
slot_data["id"] = slot.id;
|
||||
slot_data["id_task"] = slot.id_task;
|
||||
slot_data["state"] = slot.state;
|
||||
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
|
||||
slot_data["next_token"] = {
|
||||
slot_data["id"] = slot.id;
|
||||
slot_data["id_task"] = slot.id_task;
|
||||
slot_data["is_processing"] = slot.is_processing();
|
||||
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
|
||||
slot_data["next_token"] = {
|
||||
{"has_next_token", slot.has_next_token},
|
||||
{"has_new_line", slot.has_new_line},
|
||||
{"n_remain", slot.n_remaining},
|
||||
|
@ -1590,10 +1581,10 @@ struct server_context {
|
|||
{"stopping_word", slot.stopping_word},
|
||||
};
|
||||
|
||||
if (slot_data["state"] == SLOT_STATE_IDLE) {
|
||||
n_idle_slots++;
|
||||
} else {
|
||||
if (slot.is_processing()) {
|
||||
n_processing_slots++;
|
||||
} else {
|
||||
n_idle_slots++;
|
||||
}
|
||||
|
||||
slots_data.push_back(slot_data);
|
||||
|
@ -2714,8 +2705,8 @@ int main(int argc, char ** argv) {
|
|||
};
|
||||
|
||||
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding || ctx_server.params.reranking) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -2820,8 +2811,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// TODO: maybe merge this function with "handle_completions_generic"
|
||||
const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding || ctx_server.params.reranking) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -2946,11 +2937,6 @@ int main(int argc, char ** argv) {
|
|||
};
|
||||
|
||||
const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
// TODO: somehow clean up this checks in the future
|
||||
if (!ctx_server.params.embedding || ctx_server.params.reranking) {
|
||||
res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
const json body = json::parse(req.body);
|
||||
bool is_openai = false;
|
||||
|
||||
|
@ -3002,10 +2988,11 @@ int main(int argc, char ** argv) {
|
|||
};
|
||||
|
||||
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
if (!ctx_server.params.reranking) {
|
||||
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
if (!ctx_server.params.reranking || ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
const json body = json::parse(req.body);
|
||||
|
||||
// TODO: implement
|
||||
|
@ -3259,7 +3246,7 @@ int main(int argc, char ** argv) {
|
|||
ctx_server.queue_tasks.terminate();
|
||||
};
|
||||
|
||||
LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
|
||||
LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
|
||||
|
||||
ctx_server.queue_tasks.start_loop();
|
||||
|
||||
|
|
|
@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
|
|||
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
|
||||
match expected_slot_status_string:
|
||||
case 'idle':
|
||||
expected_slot_status = 0
|
||||
expected_slot_status = False
|
||||
case 'busy':
|
||||
expected_slot_status = 1
|
||||
expected_slot_status = True
|
||||
case _:
|
||||
assert False, "unknown status"
|
||||
|
||||
expected_slots = [{'id': slot_id, 'state': expected_slot_status}
|
||||
expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
|
||||
for slot_id in range(context.n_slots)]
|
||||
await request_slots_status(context, expected_slots)
|
||||
|
||||
|
@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
|
|||
if status_code == 503 and status_code == expected_http_status_code:
|
||||
return
|
||||
if status_code == 200 and status_code == expected_http_status_code:
|
||||
n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
|
||||
n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
|
||||
n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
|
||||
n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
|
||||
if ((slots_idle is None or slots_idle == n_slots_idle)
|
||||
and (slots_processing is None or slots_processing == n_slots_processing)):
|
||||
return
|
||||
|
|
|
@ -439,18 +439,60 @@ static std::string gen_chatcmplid() {
|
|||
// other common utils
|
||||
//
|
||||
|
||||
static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
|
||||
static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
|
||||
size_t i;
|
||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
static size_t longest_common_prefix(const std::string & a, const std::string & b) {
|
||||
size_t i;
|
||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||
static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
|
||||
// check for empty sequences
|
||||
if (a.empty() || b.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return i;
|
||||
// get the lengths of the input sequences
|
||||
size_t a_len = a.size();
|
||||
size_t b_len = b.size();
|
||||
|
||||
// initialize the maximum length of the longest common subsequence (LCS)
|
||||
size_t max_length = 0;
|
||||
|
||||
// use two rows instead of a 2D matrix to optimize space
|
||||
std::vector<size_t> prev_row(b_len + 1, 0);
|
||||
std::vector<size_t> curr_row(b_len + 1, 0);
|
||||
|
||||
// iterate through the elements of a
|
||||
for (size_t i = 1; i <= a_len; i++) {
|
||||
// iterate through the elements of b
|
||||
for (size_t j = 1; j <= b_len; j++) {
|
||||
// if elements at the current positions match
|
||||
if (a[i - 1] == b[j - 1]) {
|
||||
// if it's the first element of either sequences, set LCS length to 1
|
||||
if (i == 1 || j == 1) {
|
||||
curr_row[j] = 1;
|
||||
} else {
|
||||
// increment LCS length by 1 compared to the previous element
|
||||
curr_row[j] = prev_row[j - 1] + 1;
|
||||
}
|
||||
|
||||
// update max_length if necessary
|
||||
if (curr_row[j] > max_length) {
|
||||
max_length = curr_row[j];
|
||||
}
|
||||
} else {
|
||||
// reset LCS length if elements don't match
|
||||
curr_row[j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// update the previous row for the next iteration
|
||||
prev_row = curr_row;
|
||||
}
|
||||
|
||||
// return the maximum length of the LCS
|
||||
return max_length;
|
||||
}
|
||||
|
||||
static bool ends_with(const std::string & str, const std::string & suffix) {
|
||||
|
|
5
examples/simple-chat/CMakeLists.txt
Normal file
5
examples/simple-chat/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
|||
set(TARGET llama-simple-chat)
|
||||
add_executable(${TARGET} simple-chat.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
7
examples/simple-chat/README.md
Normal file
7
examples/simple-chat/README.md
Normal file
|
@ -0,0 +1,7 @@
|
|||
# llama.cpp/example/simple-chat
|
||||
|
||||
The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file.
|
||||
|
||||
```bash
|
||||
./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048
|
||||
...
|
197
examples/simple-chat/simple-chat.cpp
Normal file
197
examples/simple-chat/simple-chat.cpp
Normal file
|
@ -0,0 +1,197 @@
|
|||
#include "llama.h"
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
printf("\nexample usage:\n");
|
||||
printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::string model_path;
|
||||
int ngl = 99;
|
||||
int n_ctx = 2048;
|
||||
|
||||
// parse command line arguments
|
||||
for (int i = 1; i < argc; i++) {
|
||||
try {
|
||||
if (strcmp(argv[i], "-m") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
model_path = argv[++i];
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-c") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
n_ctx = std::stoi(argv[++i]);
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-ngl") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
ngl = std::stoi(argv[++i]);
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} catch (std::exception & e) {
|
||||
fprintf(stderr, "error: %s\n", e.what());
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (model_path.empty()) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// only print errors
|
||||
llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
|
||||
if (level >= GGML_LOG_LEVEL_ERROR) {
|
||||
fprintf(stderr, "%s", text);
|
||||
}
|
||||
}, nullptr);
|
||||
|
||||
// initialize the model
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
||||
llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
|
||||
if (!model) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// initialize the context
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.n_ctx = n_ctx;
|
||||
ctx_params.n_batch = n_ctx;
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
if (!ctx) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// initialize the sampler
|
||||
llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
|
||||
|
||||
// helper function to evaluate a prompt and generate a response
|
||||
auto generate = [&](const std::string & prompt) {
|
||||
std::string response;
|
||||
|
||||
// tokenize the prompt
|
||||
const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
|
||||
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
|
||||
GGML_ABORT("failed to tokenize the prompt\n");
|
||||
}
|
||||
|
||||
// prepare a batch for the prompt
|
||||
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||
llama_token new_token_id;
|
||||
while (true) {
|
||||
// check if we have enough space in the context to evaluate this batch
|
||||
int n_ctx = llama_n_ctx(ctx);
|
||||
int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
printf("\033[0m\n");
|
||||
fprintf(stderr, "context size exceeded\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
GGML_ABORT("failed to decode\n");
|
||||
}
|
||||
|
||||
// sample the next token
|
||||
new_token_id = llama_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// convert the token to a string, print it and add it to the response
|
||||
char buf[256];
|
||||
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
|
||||
if (n < 0) {
|
||||
GGML_ABORT("failed to convert token to piece\n");
|
||||
}
|
||||
std::string piece(buf, n);
|
||||
printf("%s", piece.c_str());
|
||||
fflush(stdout);
|
||||
response += piece;
|
||||
|
||||
// prepare the next batch with the sampled token
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
}
|
||||
|
||||
return response;
|
||||
};
|
||||
|
||||
std::vector<llama_chat_message> messages;
|
||||
std::vector<char> formatted(llama_n_ctx(ctx));
|
||||
int prev_len = 0;
|
||||
while (true) {
|
||||
// get user input
|
||||
printf("\033[32m> \033[0m");
|
||||
std::string user;
|
||||
std::getline(std::cin, user);
|
||||
|
||||
if (user.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
// add the user input to the message list and format it
|
||||
messages.push_back({"user", strdup(user.c_str())});
|
||||
int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
|
||||
if (new_len > (int)formatted.size()) {
|
||||
formatted.resize(new_len);
|
||||
new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
|
||||
}
|
||||
if (new_len < 0) {
|
||||
fprintf(stderr, "failed to apply the chat template\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// remove previous messages to obtain the prompt to generate the response
|
||||
std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
|
||||
|
||||
// generate a response
|
||||
printf("\033[33m");
|
||||
std::string response = generate(prompt);
|
||||
printf("\n\033[0m");
|
||||
|
||||
// add the response to the messages
|
||||
messages.push_back({"assistant", strdup(response.c_str())});
|
||||
prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
|
||||
if (prev_len < 0) {
|
||||
fprintf(stderr, "failed to apply the chat template\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// free resources
|
||||
for (auto & msg : messages) {
|
||||
free(const_cast<char *>(msg.content));
|
||||
}
|
||||
llama_sampler_free(smpl);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
20
flake.lock
generated
20
flake.lock
generated
|
@ -5,11 +5,11 @@
|
|||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1727826117,
|
||||
"narHash": "sha256-K5ZLCyfO/Zj9mPFldf3iwS6oZStJcU4tSpiXTMYaaL0=",
|
||||
"lastModified": 1730504689,
|
||||
"narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "3d04084d54bedc3d6b8b736c70ef449225c361b1",
|
||||
"rev": "506278e768c2a08bec68eb62932193e341f55c90",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -20,11 +20,11 @@
|
|||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1729665710,
|
||||
"narHash": "sha256-AlcmCXJZPIlO5dmFzV3V2XF6x/OpNWUV8Y/FMPGd8Z4=",
|
||||
"lastModified": 1730200266,
|
||||
"narHash": "sha256-l253w0XMT8nWHGXuXqyiIC/bMvh1VRszGXgdpQlfhvU=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "2768c7d042a37de65bb1b5b3268fc987e534c49d",
|
||||
"rev": "807e9154dcb16384b1b765ebe9cd2bba2ac287fd",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -36,14 +36,14 @@
|
|||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1727825735,
|
||||
"narHash": "sha256-0xHYkMkeLVQAMa7gvkddbPqpxph+hDzdu1XdGPJR+Os=",
|
||||
"lastModified": 1730504152,
|
||||
"narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
|
|
|
@ -305,27 +305,10 @@ extern "C" {
|
|||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
// Create a backend buffer from an existing pointer
|
||||
// CPU buffer types are always available
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
38
ggml/include/ggml-cpp.h
Normal file
38
ggml/include/ggml-cpp.h
Normal file
|
@ -0,0 +1,38 @@
|
|||
#pragma once
|
||||
|
||||
#ifndef __cplusplus
|
||||
#error "This header is for C++ only"
|
||||
#endif
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include <memory>
|
||||
|
||||
// Smart pointers for ggml types
|
||||
|
||||
// ggml
|
||||
|
||||
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
|
||||
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
|
||||
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
|
||||
|
||||
// ggml-alloc
|
||||
|
||||
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
|
||||
|
||||
// ggml-backend
|
||||
|
||||
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
|
||||
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
|
||||
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
|
||||
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
|
||||
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
|
||||
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
|
||||
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
|
150
ggml/include/ggml-cpu.h
Normal file
150
ggml/include/ggml-cpu.h
Normal file
|
@ -0,0 +1,150 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
int n_threads;
|
||||
struct ggml_threadpool * threadpool;
|
||||
|
||||
// abort ggml_graph_compute when true
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
// numa strategies
|
||||
enum ggml_numa_strategy {
|
||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||
GGML_NUMA_STRATEGY_COUNT
|
||||
};
|
||||
|
||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
// TODO: move to backend interface
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_sve (void);
|
||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||
// get the sve vector length in bytes
|
||||
GGML_API int ggml_cpu_get_sve_cnt(void);
|
||||
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
|
||||
typedef void (*ggml_from_float_to_mat_t)
|
||||
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
struct ggml_type_traits_cpu {
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
int64_t ncols; // number of columns to process simultaneously
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
};
|
||||
|
||||
GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
||||
|
||||
GGML_API void ggml_cpu_init(void);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -217,7 +217,6 @@
|
|||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_PARAMS 2048
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_SRC 10
|
||||
#define GGML_MAX_N_THREADS 512
|
||||
#define GGML_MAX_OP_PARAMS 64
|
||||
|
@ -559,10 +558,10 @@ extern "C" {
|
|||
|
||||
enum ggml_log_level {
|
||||
GGML_LOG_LEVEL_NONE = 0,
|
||||
GGML_LOG_LEVEL_INFO = 1,
|
||||
GGML_LOG_LEVEL_WARN = 2,
|
||||
GGML_LOG_LEVEL_ERROR = 3,
|
||||
GGML_LOG_LEVEL_DEBUG = 4,
|
||||
GGML_LOG_LEVEL_DEBUG = 1,
|
||||
GGML_LOG_LEVEL_INFO = 2,
|
||||
GGML_LOG_LEVEL_WARN = 3,
|
||||
GGML_LOG_LEVEL_ERROR = 4,
|
||||
GGML_LOG_LEVEL_CONT = 5, // continue previous log
|
||||
};
|
||||
|
||||
|
@ -574,6 +573,13 @@ extern "C" {
|
|||
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
||||
};
|
||||
|
||||
struct ggml_init_params {
|
||||
// memory pool
|
||||
size_t mem_size; // bytes
|
||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||
bool no_alloc; // don't allocate memory for the tensor data
|
||||
};
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
|
@ -619,66 +625,6 @@ extern "C" {
|
|||
// If it returns true, the computation is aborted
|
||||
typedef bool (*ggml_abort_callback)(void * data);
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
int n_threads;
|
||||
struct ggml_threadpool * threadpool;
|
||||
|
||||
// abort ggml_graph_compute when true
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
// scratch buffer
|
||||
struct ggml_scratch {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
void * data;
|
||||
};
|
||||
|
||||
struct ggml_init_params {
|
||||
// memory pool
|
||||
size_t mem_size; // bytes
|
||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||
bool no_alloc; // don't allocate memory for the tensor data
|
||||
};
|
||||
|
||||
// numa strategies
|
||||
enum ggml_numa_strategy {
|
||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||
GGML_NUMA_STRATEGY_COUNT
|
||||
};
|
||||
|
||||
//
|
||||
// GUID
|
||||
|
@ -701,9 +647,6 @@ extern "C" {
|
|||
// accepts a UTF-8 path, even on Windows
|
||||
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
||||
|
||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||
|
||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
|
@ -760,12 +703,12 @@ extern "C" {
|
|||
|
||||
// main
|
||||
|
||||
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
GGML_API void ggml_free(struct ggml_context * ctx);
|
||||
GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
|
||||
GGML_API void ggml_reset(struct ggml_context * ctx);
|
||||
GGML_API void ggml_free (struct ggml_context * ctx);
|
||||
|
||||
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||
|
||||
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
|
||||
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
||||
|
||||
|
@ -805,8 +748,7 @@ extern "C" {
|
|||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
||||
|
@ -816,35 +758,25 @@ extern "C" {
|
|||
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
// Converts a flat index into coordinates
|
||||
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
||||
|
||||
// Tensor flags
|
||||
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// operations on tensors with backpropagation
|
||||
//
|
||||
|
@ -2060,9 +1992,6 @@ extern "C" {
|
|||
// automatic differentiation
|
||||
//
|
||||
|
||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
|
||||
|
||||
|
@ -2094,27 +2023,6 @@ extern "C" {
|
|||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||
|
||||
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
||||
|
@ -2285,6 +2193,8 @@ extern "C" {
|
|||
} lbfgs;
|
||||
};
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
||||
|
||||
// optimize the function defined by the tensor f
|
||||
|
@ -2316,12 +2226,6 @@ extern "C" {
|
|||
ggml_opt_callback callback,
|
||||
void * callback_data);
|
||||
|
||||
//
|
||||
// tensor flags
|
||||
//
|
||||
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// quantization
|
||||
//
|
||||
|
@ -2490,8 +2394,6 @@ extern "C" {
|
|||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||
GGML_API int ggml_cpu_has_amx_int8 (void);
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_sve (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_metal (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
|
@ -2508,17 +2410,9 @@ extern "C" {
|
|||
GGML_API int ggml_cpu_has_sycl (void);
|
||||
GGML_API int ggml_cpu_has_rpc (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||
GGML_API int ggml_cpu_has_cann (void);
|
||||
GGML_API int ggml_cpu_has_llamafile (void);
|
||||
|
||||
// get the sve vector length in bytes
|
||||
GGML_API int ggml_cpu_get_sve_cnt(void);
|
||||
|
||||
//
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
#define GGML_RESTRICT
|
||||
|
@ -2527,14 +2421,6 @@ extern "C" {
|
|||
#endif
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_to_mat_t)
|
||||
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
struct ggml_type_traits {
|
||||
const char * type_name;
|
||||
|
@ -2545,13 +2431,6 @@ extern "C" {
|
|||
ggml_to_float_t to_float;
|
||||
ggml_from_float_t from_float;
|
||||
ggml_from_float_t from_float_ref;
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
int64_t ncols; // number of columns to process simultaneously
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
};
|
||||
|
||||
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
|
||||
|
|
|
@ -800,6 +800,7 @@ if (GGML_KOMPUTE)
|
|||
kompute-shaders/op_mul_mat_q8_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_1.comp
|
||||
kompute-shaders/op_mul_mat_q4_k.comp
|
||||
kompute-shaders/op_mul_mat_q6_k.comp
|
||||
kompute-shaders/op_getrows_f32.comp
|
||||
kompute-shaders/op_getrows_f16.comp
|
||||
|
@ -833,6 +834,7 @@ if (GGML_KOMPUTE)
|
|||
shaderop_mul_mat_q8_0.h
|
||||
shaderop_mul_mat_q4_0.h
|
||||
shaderop_mul_mat_q4_1.h
|
||||
shaderop_mul_mat_q4_k.h
|
||||
shaderop_mul_mat_q6_k.h
|
||||
shaderop_getrows_f32.h
|
||||
shaderop_getrows_f16.h
|
||||
|
@ -1364,9 +1366,12 @@ endif()
|
|||
|
||||
add_library(ggml
|
||||
../include/ggml.h
|
||||
../include/ggml-cpu.h
|
||||
../include/ggml-alloc.h
|
||||
../include/ggml-backend.h
|
||||
../include/ggml-cpp.h
|
||||
ggml.c
|
||||
ggml-cpu.c
|
||||
ggml-alloc.c
|
||||
ggml-backend.cpp
|
||||
ggml-quants.c
|
||||
|
@ -1391,7 +1396,7 @@ if (EMSCRIPTEN)
|
|||
endif()
|
||||
|
||||
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
|
||||
target_include_directories(ggml PUBLIC ../include)
|
||||
target_include_directories(ggml PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
||||
target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
|
||||
target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
|
||||
target_compile_features (ggml PRIVATE c_std_11) # don't bump
|
||||
|
@ -1400,7 +1405,7 @@ list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
|
|||
|
||||
find_library(MATH_LIBRARY m)
|
||||
if (MATH_LIBRARY)
|
||||
if (NOT WIN32 OR NOT GGML_SYCL)
|
||||
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
|
||||
endif()
|
||||
endif()
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include <math.h>
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1227,7 +1227,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
|
|||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
|
||||
|
||||
return buffer;
|
||||
|
|
13720
ggml/src/ggml-cpu.c
Normal file
13720
ggml/src/ggml-cpu.c
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
|||
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
|
||||
if (err != cudaErrorPeerAccessAlreadyEnabled) {
|
||||
CUDA_CHECK(err);
|
||||
} else {
|
||||
// reset the error
|
||||
cudaGetLastError();
|
||||
}
|
||||
} else {
|
||||
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
|
||||
if (err != cudaErrorPeerAccessNotEnabled) {
|
||||
CUDA_CHECK(err);
|
||||
} else {
|
||||
// reset the error
|
||||
cudaGetLastError();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3107,18 +3113,20 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
|
||||
break;
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -36,6 +37,20 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
static inline int ggml_up32(int n) {
|
||||
return (n + 31) & ~31;
|
||||
}
|
||||
|
||||
//static inline int ggml_up64(int n) {
|
||||
// return (n + 63) & ~63;
|
||||
//}
|
||||
|
||||
static inline int ggml_up(int n, int m) {
|
||||
// assert m is a power of 2
|
||||
GGML_ASSERT((m & (m - 1)) == 0);
|
||||
return (n + m - 1) & ~(m - 1);
|
||||
}
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
|
@ -51,6 +66,74 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
|
|||
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
||||
|
||||
#define GGML_DEBUG 0
|
||||
|
||||
#if (GGML_DEBUG >= 1)
|
||||
#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 5)
|
||||
#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_5(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 10)
|
||||
#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_10(...)
|
||||
#endif
|
||||
|
||||
// tensor params
|
||||
|
||||
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
||||
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
|
||||
assert(params_size <= GGML_MAX_OP_PARAMS);
|
||||
memcpy(tensor->op_params, params, params_size);
|
||||
}
|
||||
|
||||
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
||||
return ((const int32_t *)(tensor->op_params))[i];
|
||||
}
|
||||
|
||||
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
||||
return ((const float *)(tensor->op_params))[i];
|
||||
}
|
||||
|
||||
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
||||
((int32_t *)(tensor->op_params))[i] = value;
|
||||
}
|
||||
|
||||
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
||||
((float *)(tensor->op_params))[i] = value;
|
||||
}
|
||||
|
||||
struct ggml_map_custom1_op_params {
|
||||
ggml_custom1_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
|
||||
struct ggml_map_custom2_op_params {
|
||||
ggml_custom2_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
|
||||
struct ggml_map_custom3_op_params {
|
||||
ggml_custom3_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
// bitset
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
@ -204,6 +287,10 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
|||
void * ggml_aligned_malloc(size_t size);
|
||||
void ggml_aligned_free(void * ptr, size_t size);
|
||||
|
||||
// TODO: move to threading file
|
||||
void ggml_critical_section_start(void);
|
||||
void ggml_critical_section_end(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "shaderop_mul_mat_q8_0.h"
|
||||
#include "shaderop_mul_mat_q4_0.h"
|
||||
#include "shaderop_mul_mat_q4_1.h"
|
||||
#include "shaderop_mul_mat_q4_k.h"
|
||||
#include "shaderop_mul_mat_q6_k.h"
|
||||
#include "shaderop_mul_mat_mat_f32.h"
|
||||
#include "shaderop_getrows_f32.h"
|
||||
|
@ -1067,6 +1068,40 @@ static void ggml_vk_mul_mat_q8_0(Args&&... args) {
|
|||
ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
static void ggml_vk_mul_mat_q4_k(
|
||||
kp::Sequence& seq,
|
||||
const std::shared_ptr<kp::Tensor>& inA,
|
||||
const std::shared_ptr<kp::Tensor>& inB,
|
||||
const std::shared_ptr<kp::Tensor>& out,
|
||||
uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
|
||||
int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10,
|
||||
int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0,
|
||||
int32_t ne1, int32_t r2, int32_t r3
|
||||
) {
|
||||
const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
|
||||
kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
|
||||
|
||||
struct PushConstants {
|
||||
uint32_t inAOff, inBOff, outOff;
|
||||
int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3;
|
||||
} pushConsts {
|
||||
0, 0, 0,
|
||||
ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3
|
||||
};
|
||||
|
||||
std::shared_ptr<kp::Algorithm> s_algo = nullptr;
|
||||
if (!komputeManager()->hasAlgorithm(__func__)) {
|
||||
s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts});
|
||||
} else {
|
||||
s_algo = komputeManager()->getAlgorithm(__func__);
|
||||
s_algo->setTensors({inA, inB, out});
|
||||
s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)});
|
||||
s_algo->setPushConstants<PushConstants>({pushConsts});
|
||||
s_algo->updateDescriptors(s_kompute_context->pool.get());
|
||||
}
|
||||
seq.record<kp::OpAlgoDispatch>(s_algo);
|
||||
}
|
||||
|
||||
static void ggml_vk_mul_mat_q6_k(
|
||||
kp::Sequence& seq,
|
||||
const std::shared_ptr<kp::Tensor>& inA,
|
||||
|
@ -1384,6 +1419,7 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
|
|||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_K:
|
||||
return true;
|
||||
default:
|
||||
;
|
||||
|
@ -1635,6 +1671,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|||
ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
|
||||
);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
ggml_vk_mul_mat_q4_k(
|
||||
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
|
||||
ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03
|
||||
);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
ggml_vk_mul_mat_q6_k(
|
||||
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
|
||||
|
|
|
@ -450,7 +450,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
|||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if !__has_feature(objc_arc)
|
||||
[options release];
|
||||
#endif
|
||||
}
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[src release];
|
||||
#endif // GGML_METAL_EMBED_LIBRARY
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -12,6 +12,436 @@ using namespace metal;
|
|||
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
|
||||
constexpr constant static float kvalues_iq4nl_f[16] = {
|
||||
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
|
||||
};
|
||||
|
||||
// NOTE: this is not dequantizing - we are simply fitting the template
|
||||
template <typename type4x4>
|
||||
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
||||
reg = (type4x4)(*src);
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
|
||||
reg = (type4x4)(*src);
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
||||
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float md = -8.h * xb->d;
|
||||
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
||||
const ushort mask1 = mask0 << 8;
|
||||
|
||||
for (int i=0;i<8;i++) {
|
||||
reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
|
||||
reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
|
||||
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float m = xb->m;
|
||||
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
||||
const ushort mask1 = mask0 << 8;
|
||||
|
||||
for (int i=0;i<8;i++) {
|
||||
reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
|
||||
reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 3);
|
||||
const float d = xb->d;
|
||||
const float md = -16.h * xb->d;
|
||||
const ushort mask = il ? 0x00F0 : 0x000F;
|
||||
|
||||
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
||||
|
||||
const int x_mv = il ? 4 : 0;
|
||||
|
||||
const int gh_mv = il ? 12 : 0;
|
||||
const int gh_bk = il ? 0 : 4;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
// extract the 5-th bits for x0 and x1
|
||||
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
||||
|
||||
// combine the 4-bits from qs with the 5th bit
|
||||
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
||||
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
||||
|
||||
reg[i/2][2*(i%2)+0] = d * x0 + md;
|
||||
reg[i/2][2*(i%2)+1] = d * x1 + md;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 4);
|
||||
const float d = xb->d;
|
||||
const float m = xb->m;
|
||||
const ushort mask = il ? 0x00F0 : 0x000F;
|
||||
|
||||
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
||||
|
||||
const int x_mv = il ? 4 : 0;
|
||||
|
||||
const int gh_mv = il ? 12 : 0;
|
||||
const int gh_bk = il ? 0 : 4;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
// extract the 5-th bits for x0 and x1
|
||||
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
||||
|
||||
// combine the 4-bits from qs with the 5th bit
|
||||
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
||||
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
||||
|
||||
reg[i/2][2*(i%2)+0] = d * x0 + m;
|
||||
reg[i/2][2*(i%2)+1] = d * x1 + m;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
|
||||
device const int8_t * qs = ((device const int8_t *)xb->qs);
|
||||
const half d = xb->d;
|
||||
|
||||
for (int i = 0; i < 16; i++) {
|
||||
reg[i/4][i%4] = (qs[i + 16*il] * d);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
|
||||
const float d = xb->d;
|
||||
const float min = xb->dmin;
|
||||
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
||||
float dl, ml;
|
||||
uint8_t sc = xb->scales[il];
|
||||
|
||||
q = q + 32*(il/8) + 16*(il&1);
|
||||
il = (il/2)%4;
|
||||
|
||||
half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
|
||||
const half d_all = xb->d;
|
||||
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
||||
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
q = q + 32 * (il/8) + 16 * (il&1);
|
||||
h = h + 16 * (il&1);
|
||||
uint8_t m = 1 << (il/2);
|
||||
uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
|
||||
((il/4)>0 ? 12 : 3);
|
||||
uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
|
||||
uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
|
||||
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
|
||||
: (scale_2&kmask2) | ((scale_1&kmask1) << 4);
|
||||
float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
|
||||
const float ml = 4.f * dl;
|
||||
|
||||
il = (il/2) & 3;
|
||||
const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
dl *= coef;
|
||||
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
|
||||
}
|
||||
}
|
||||
|
||||
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
|
||||
return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
|
||||
: uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
|
||||
device const uchar * q = xb->qs;
|
||||
|
||||
short is = (il/4) * 2;
|
||||
q = q + (il/4) * 32 + 16 * (il&1);
|
||||
il = il & 3;
|
||||
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
|
||||
const float d = il < 2 ? xb->d : xb->d / 16.h;
|
||||
const float min = xb->dmin;
|
||||
const float dl = d * sc[0];
|
||||
const float ml = min * sc[1];
|
||||
|
||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) {
|
||||
device const uint8_t * q = xb->qs;
|
||||
device const uint8_t * qh = xb->qh;
|
||||
|
||||
short is = (il/4) * 2;
|
||||
q = q + 32 * (il/4) + 16 * (il&1);
|
||||
qh = qh + 16 * (il&1);
|
||||
uint8_t ul = 1 << (il/2);
|
||||
il = il & 3;
|
||||
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
|
||||
const float d = il < 2 ? xb->d : xb->d / 16.f;
|
||||
const float min = xb->dmin;
|
||||
const float dl = d * sc[0];
|
||||
const float ml = min * sc[1];
|
||||
|
||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||
const float qh_val = il<2 ? 16.f : 256.f;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
|
||||
const half d_all = xb->d;
|
||||
device const uint8_t * ql = (device const uint8_t *)xb->ql;
|
||||
device const uint8_t * qh = (device const uint8_t *)xb->qh;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||
qh = qh + 32*(il/8) + 16*(il&1);
|
||||
float sc = scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
|
||||
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
|
||||
const float coef = il>1 ? 1.f/16.f : 1.f;
|
||||
const float ml = d_all * sc * 32.f;
|
||||
const float dl = d_all * sc * coef;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
|
||||
: ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
|
||||
reg[i/4][i%4] = dl * q - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
// each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
|
||||
device const uint16_t * q2 = xb->qs + 4*ib32;
|
||||
const uint32_t aux32_g = q2[0] | (q2[1] << 16);
|
||||
const uint32_t aux32_s = q2[2] | (q2[3] << 16);
|
||||
thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
|
||||
const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
|
||||
constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
||||
uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
||||
signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint16_t * q2 = xb->qs + 4*ib32;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
|
||||
constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
|
||||
uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
|
||||
signs = ksigns_iq2xs[q2[2*il+1] >> 9];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * q3 = xb->qs + 8*ib32;
|
||||
device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
|
||||
uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
|
||||
grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
|
||||
signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * qs = xb->qs + 8*ib32;
|
||||
device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
|
||||
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
||||
const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
|
||||
reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256)));
|
||||
grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
|
||||
reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint8_t * signs = qs + QK_K/8;
|
||||
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
|
||||
reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
const float d = xb->d;
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint16_t * qh = xb->qh;
|
||||
const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1);
|
||||
const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA);
|
||||
const uint16_t h = qh[ib32] >> 6*il;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * (grid1[i] & 0xf) + ml;
|
||||
reg[1][i] = dl * (grid1[i] >> 4) + ml;
|
||||
reg[2][i] = dl * (grid2[i] & 0xf) + ml;
|
||||
reg[3][i] = dl * (grid2[i] >> 4) + ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
device const uint16_t * sc = (device const uint16_t *)xb->scales;
|
||||
|
||||
iq1m_scale_t scale;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
const float d = scale.f16;
|
||||
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint8_t * qh = xb->qh + 2*ib32 + il;
|
||||
|
||||
const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
|
||||
const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * (grid1[i] & 0xf) + ml1;
|
||||
reg[1][i] = dl * (grid1[i] >> 4) + ml1;
|
||||
reg[2][i] = dl * (grid2[i] & 0xf) + ml2;
|
||||
reg[3][i] = dl * (grid2[i] >> 4) + ml2;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
|
||||
const float d = xb->d;
|
||||
uint32_t aux32;
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
|
||||
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
||||
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;
|
||||
const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4);
|
||||
const float d = (float)xb->d * (ls - 32);
|
||||
uint32_t aux32;
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f;
|
||||
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
||||
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
}
|
||||
|
||||
enum ggml_sort_order {
|
||||
GGML_SORT_ORDER_ASC,
|
||||
GGML_SORT_ORDER_DESC,
|
||||
|
@ -2776,11 +3206,11 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
|||
const short iv3 = iq3 / rv3;
|
||||
|
||||
// load the queries from shared memory into local memory
|
||||
float4 mq[D4];
|
||||
float4 mq[D4/NW];
|
||||
|
||||
for (short ii = 0; ii < D4; ii += NW) {
|
||||
short i = ii + tiisg;
|
||||
mq[i] = (float4) sq4[i];
|
||||
mq[ii/NW] = (float4) sq4[i];
|
||||
}
|
||||
|
||||
// pointer to the mask
|
||||
|
@ -2812,7 +3242,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
|||
mk[2] = (float4) pk4[i + 2*(nb11/8)];
|
||||
mk[3] = (float4) pk4[i + 3*(nb11/8)];
|
||||
|
||||
mqk += (float4) (mq[i] * mk);
|
||||
mqk += (float4) (mq[ii/NW] * mk);
|
||||
}
|
||||
|
||||
// reduce the results from the threads in the simdgroup
|
||||
|
@ -2857,8 +3287,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
|||
// O = diag(ms)*O
|
||||
#pragma unroll
|
||||
for (short ii = 0; ii < D4; ii += NW) {
|
||||
const short i = ii + tiisg;
|
||||
lo[i/NW] *= ms;
|
||||
lo[ii/NW] *= ms;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2872,10 +3301,10 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
|||
for (short ii = 0; ii < D4; ii += NW) {
|
||||
const short i = ii + tiisg;
|
||||
|
||||
lo[i/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
|
||||
lo[i/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
|
||||
lo[i/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
|
||||
lo[i/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
|
||||
lo[ii/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
|
||||
lo[ii/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
|
||||
lo[ii/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
|
||||
lo[ii/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3340,10 +3769,6 @@ static inline int best_index_int8(int n, constant float * val, float x) {
|
|||
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
||||
}
|
||||
|
||||
constexpr constant static float kvalues_iq4nl_f[16] = {
|
||||
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
|
||||
};
|
||||
|
||||
kernel void kernel_cpy_f32_iq4_nl(
|
||||
device const float * src0,
|
||||
device void * dst,
|
||||
|
@ -5458,440 +5883,6 @@ kernel void kernel_mul_mv_iq4_xs_f32(
|
|||
kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
//============================= templates and their specializations =============================
|
||||
|
||||
// NOTE: this is not dequantizing - we are simply fitting the template
|
||||
template <typename type4x4>
|
||||
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
||||
float4x4 temp = *(((device float4x4 *)src));
|
||||
for (int i = 0; i < 16; i++){
|
||||
reg[i/4][i%4] = temp[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
|
||||
half4x4 temp = *(((device half4x4 *)src));
|
||||
for (int i = 0; i < 16; i++){
|
||||
reg[i/4][i%4] = temp[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
||||
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float md = -8.h * xb->d;
|
||||
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
||||
const ushort mask1 = mask0 << 8;
|
||||
|
||||
for (int i=0;i<8;i++) {
|
||||
reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
|
||||
reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
|
||||
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float m = xb->m;
|
||||
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
||||
const ushort mask1 = mask0 << 8;
|
||||
|
||||
for (int i=0;i<8;i++) {
|
||||
reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
|
||||
reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 3);
|
||||
const float d = xb->d;
|
||||
const float md = -16.h * xb->d;
|
||||
const ushort mask = il ? 0x00F0 : 0x000F;
|
||||
|
||||
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
||||
|
||||
const int x_mv = il ? 4 : 0;
|
||||
|
||||
const int gh_mv = il ? 12 : 0;
|
||||
const int gh_bk = il ? 0 : 4;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
// extract the 5-th bits for x0 and x1
|
||||
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
||||
|
||||
// combine the 4-bits from qs with the 5th bit
|
||||
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
||||
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
||||
|
||||
reg[i/2][2*(i%2)+0] = d * x0 + md;
|
||||
reg[i/2][2*(i%2)+1] = d * x1 + md;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 4);
|
||||
const float d = xb->d;
|
||||
const float m = xb->m;
|
||||
const ushort mask = il ? 0x00F0 : 0x000F;
|
||||
|
||||
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
||||
|
||||
const int x_mv = il ? 4 : 0;
|
||||
|
||||
const int gh_mv = il ? 12 : 0;
|
||||
const int gh_bk = il ? 0 : 4;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
// extract the 5-th bits for x0 and x1
|
||||
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
||||
|
||||
// combine the 4-bits from qs with the 5th bit
|
||||
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
||||
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
||||
|
||||
reg[i/2][2*(i%2)+0] = d * x0 + m;
|
||||
reg[i/2][2*(i%2)+1] = d * x1 + m;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
|
||||
device const int8_t * qs = ((device const int8_t *)xb->qs);
|
||||
const half d = xb->d;
|
||||
|
||||
for (int i = 0; i < 16; i++) {
|
||||
reg[i/4][i%4] = (qs[i + 16*il] * d);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
|
||||
const float d = xb->d;
|
||||
const float min = xb->dmin;
|
||||
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
||||
float dl, ml;
|
||||
uint8_t sc = xb->scales[il];
|
||||
|
||||
q = q + 32*(il/8) + 16*(il&1);
|
||||
il = (il/2)%4;
|
||||
|
||||
half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
|
||||
const half d_all = xb->d;
|
||||
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
||||
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
q = q + 32 * (il/8) + 16 * (il&1);
|
||||
h = h + 16 * (il&1);
|
||||
uint8_t m = 1 << (il/2);
|
||||
uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
|
||||
((il/4)>0 ? 12 : 3);
|
||||
uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
|
||||
uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
|
||||
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
|
||||
: (scale_2&kmask2) | ((scale_1&kmask1) << 4);
|
||||
float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
|
||||
const float ml = 4.f * dl;
|
||||
|
||||
il = (il/2) & 3;
|
||||
const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
dl *= coef;
|
||||
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
|
||||
}
|
||||
}
|
||||
|
||||
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
|
||||
return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
|
||||
: uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
|
||||
device const uchar * q = xb->qs;
|
||||
|
||||
short is = (il/4) * 2;
|
||||
q = q + (il/4) * 32 + 16 * (il&1);
|
||||
il = il & 3;
|
||||
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
|
||||
const float d = il < 2 ? xb->d : xb->d / 16.h;
|
||||
const float min = xb->dmin;
|
||||
const float dl = d * sc[0];
|
||||
const float ml = min * sc[1];
|
||||
|
||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) {
|
||||
device const uint8_t * q = xb->qs;
|
||||
device const uint8_t * qh = xb->qh;
|
||||
|
||||
short is = (il/4) * 2;
|
||||
q = q + 32 * (il/4) + 16 * (il&1);
|
||||
qh = qh + 16 * (il&1);
|
||||
uint8_t ul = 1 << (il/2);
|
||||
il = il & 3;
|
||||
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
|
||||
const float d = il < 2 ? xb->d : xb->d / 16.f;
|
||||
const float min = xb->dmin;
|
||||
const float dl = d * sc[0];
|
||||
const float ml = min * sc[1];
|
||||
|
||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||
const float qh_val = il<2 ? 16.f : 256.f;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
|
||||
const half d_all = xb->d;
|
||||
device const uint8_t * ql = (device const uint8_t *)xb->ql;
|
||||
device const uint8_t * qh = (device const uint8_t *)xb->qh;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||
qh = qh + 32*(il/8) + 16*(il&1);
|
||||
float sc = scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
|
||||
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
|
||||
const float coef = il>1 ? 1.f/16.f : 1.f;
|
||||
const float ml = d_all * sc * 32.f;
|
||||
const float dl = d_all * sc * coef;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
|
||||
: ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
|
||||
reg[i/4][i%4] = dl * q - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
// each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
|
||||
device const uint16_t * q2 = xb->qs + 4*ib32;
|
||||
const uint32_t aux32_g = q2[0] | (q2[1] << 16);
|
||||
const uint32_t aux32_s = q2[2] | (q2[3] << 16);
|
||||
thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
|
||||
const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
|
||||
constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
||||
uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
||||
signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint16_t * q2 = xb->qs + 4*ib32;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
|
||||
constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
|
||||
uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
|
||||
signs = ksigns_iq2xs[q2[2*il+1] >> 9];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * q3 = xb->qs + 8*ib32;
|
||||
device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
|
||||
uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
|
||||
grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
|
||||
signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * qs = xb->qs + 8*ib32;
|
||||
device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
|
||||
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
||||
const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
|
||||
reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256)));
|
||||
grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
|
||||
reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint8_t * signs = qs + QK_K/8;
|
||||
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
|
||||
reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
const float d = xb->d;
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint16_t * qh = xb->qh;
|
||||
const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1);
|
||||
const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA);
|
||||
const uint16_t h = qh[ib32] >> 6*il;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * (grid1[i] & 0xf) + ml;
|
||||
reg[1][i] = dl * (grid1[i] >> 4) + ml;
|
||||
reg[2][i] = dl * (grid2[i] & 0xf) + ml;
|
||||
reg[3][i] = dl * (grid2[i] >> 4) + ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
device const uint16_t * sc = (device const uint16_t *)xb->scales;
|
||||
|
||||
iq1m_scale_t scale;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
const float d = scale.f16;
|
||||
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint8_t * qh = xb->qh + 2*ib32 + il;
|
||||
|
||||
const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
|
||||
const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * (grid1[i] & 0xf) + ml1;
|
||||
reg[1][i] = dl * (grid1[i] >> 4) + ml1;
|
||||
reg[2][i] = dl * (grid2[i] & 0xf) + ml2;
|
||||
reg[3][i] = dl * (grid2[i] >> 4) + ml2;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
|
||||
const float d = xb->d;
|
||||
uint32_t aux32;
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
|
||||
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
||||
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;
|
||||
const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4);
|
||||
const float d = (float)xb->d * (ls - 32);
|
||||
uint32_t aux32;
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f;
|
||||
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
||||
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
kernel void kernel_get_rows_q(
|
||||
device const void * src0,
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
@ -9104,10 +9104,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||
|
||||
#elif defined __AVX__
|
||||
|
||||
const __m128i m4 = _mm_set1_epi8(0xF);
|
||||
const __m128i m3 = _mm_set1_epi8(3);
|
||||
const __m128i m32s = _mm_set1_epi8(32);
|
||||
const __m128i m2 = _mm_set1_epi8(2);
|
||||
const __m128i m15 = _mm_set1_epi8(15);
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
|
@ -9119,12 +9117,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
// handle the q6_k -32 offset separately using bsums
|
||||
const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
|
||||
const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
|
||||
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
|
||||
const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
|
||||
const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
|
||||
const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
|
||||
const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
|
||||
|
||||
__m128i sumi_0 = _mm_setzero_si128();
|
||||
__m128i sumi_1 = _mm_setzero_si128();
|
||||
|
||||
__m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
|
||||
int is = 0;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
|
||||
const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
|
||||
|
@ -9132,26 +9138,26 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||
|
||||
const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
|
||||
const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
|
||||
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
|
||||
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
|
||||
const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
|
||||
const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
|
||||
const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
|
||||
const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
|
||||
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
|
||||
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
|
||||
const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
|
||||
const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
|
||||
const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
|
||||
const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
|
||||
|
||||
const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
||||
const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
||||
const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
||||
const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
||||
|
||||
const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0);
|
||||
const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1);
|
||||
const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2);
|
||||
const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3);
|
||||
const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4);
|
||||
const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5);
|
||||
const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6);
|
||||
const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7);
|
||||
const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
|
||||
const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
|
||||
const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
|
||||
const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
|
||||
const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
|
||||
const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
|
||||
const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
|
||||
const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
|
||||
|
||||
const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
||||
const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
||||
|
@ -9162,15 +9168,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||
const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
||||
const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
||||
|
||||
__m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0);
|
||||
__m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1);
|
||||
__m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2);
|
||||
__m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3);
|
||||
__m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4);
|
||||
__m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5);
|
||||
__m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6);
|
||||
__m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7);
|
||||
|
||||
__m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
|
||||
__m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
|
||||
__m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
|
||||
|
@ -9180,32 +9177,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||
__m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
|
||||
__m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
|
||||
|
||||
p16_0 = _mm_sub_epi16(p16_0, q8s_0);
|
||||
p16_1 = _mm_sub_epi16(p16_1, q8s_1);
|
||||
p16_2 = _mm_sub_epi16(p16_2, q8s_2);
|
||||
p16_3 = _mm_sub_epi16(p16_3, q8s_3);
|
||||
p16_4 = _mm_sub_epi16(p16_4, q8s_4);
|
||||
p16_5 = _mm_sub_epi16(p16_5, q8s_5);
|
||||
p16_6 = _mm_sub_epi16(p16_6, q8s_6);
|
||||
p16_7 = _mm_sub_epi16(p16_7, q8s_7);
|
||||
|
||||
const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
|
||||
shuffle = _mm_add_epi8(shuffle, m2);
|
||||
const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
|
||||
shuffle = _mm_add_epi8(shuffle, m2);
|
||||
const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle);
|
||||
shuffle = _mm_add_epi8(shuffle, m2);
|
||||
const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle);
|
||||
shuffle = _mm_add_epi8(shuffle, m2);
|
||||
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
|
||||
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
|
||||
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
|
||||
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
|
||||
is += 4;
|
||||
|
||||
p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
|
||||
p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
|
||||
p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
|
||||
p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
|
||||
p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
|
||||
p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
|
||||
p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
|
||||
p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
|
||||
p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
|
||||
p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
|
||||
p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7);
|
||||
p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
|
||||
|
||||
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
|
||||
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
|
||||
|
@ -9214,8 +9199,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||
|
||||
}
|
||||
|
||||
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
|
||||
sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
|
||||
sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
|
||||
const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
|
|
@ -1296,13 +1296,6 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
|
|||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
UNUSED(dev);
|
||||
UNUSED(max_tensor_size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
UNUSED(dev);
|
||||
UNUSED(op);
|
||||
|
@ -1328,7 +1321,7 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
|||
/* .init_backend = */ ggml_backend_rpc_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
|
||||
/* .buffer_from_host_ptr = */ NULL,
|
||||
/* .supports_op = */ ggml_backend_rpc_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
|
|
|
@ -1047,7 +1047,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|||
return buf;
|
||||
}
|
||||
|
||||
buf->size = size;
|
||||
vk::BufferCreateInfo buffer_create_info{
|
||||
vk::BufferCreateFlags(),
|
||||
size,
|
||||
|
@ -1075,7 +1074,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|||
|
||||
if (memory_type_index == UINT32_MAX) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
buf->size = 0;
|
||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||
}
|
||||
|
||||
|
@ -1092,13 +1090,11 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|||
}
|
||||
catch (const vk::SystemError& e) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
buf->size = 0;
|
||||
throw e;
|
||||
}
|
||||
} else {
|
||||
// Out of Host/Device memory, clean up buffer
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
buf->size = 0;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
@ -1111,6 +1107,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|||
device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
|
||||
|
||||
buf->device = device;
|
||||
buf->size = size;
|
||||
|
||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||
device->memory_logger->log_allocation(buf, size);
|
||||
|
|
15449
ggml/src/ggml.c
15449
ggml/src/ggml.c
File diff suppressed because it is too large
Load diff
|
@ -15,6 +15,7 @@
|
|||
#define TWOPI_F 6.283185307179586f
|
||||
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
|
||||
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
|
||||
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
|
||||
|
@ -64,6 +65,14 @@ mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
|
|||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q4_k 144
|
||||
struct block_q4_k {
|
||||
float16_t d;
|
||||
float16_t dmin;
|
||||
uint8_t scales[K_SCALE_SIZE];
|
||||
uint8_t qs[QK_K/2];
|
||||
};
|
||||
|
||||
#define sizeof_block_q6_k 210
|
||||
struct block_q6_k {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
|
|
133
ggml/src/kompute-shaders/op_mul_mat_q4_k.comp
Normal file
133
ggml/src/kompute-shaders/op_mul_mat_q4_k.comp
Normal file
|
@ -0,0 +1,133 @@
|
|||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define N_DST 4
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_k
|
||||
|
||||
layout(local_size_x = 4) in;
|
||||
layout(local_size_y = 8) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne12;
|
||||
int r2;
|
||||
int r3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint16_t kmask1 = uint16_t(0x3f3f);
|
||||
const uint16_t kmask2 = uint16_t(0x0f0f);
|
||||
const uint16_t kmask3 = uint16_t(0xc0c0);
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID/8; // 0...3
|
||||
const uint it = gl_SubgroupInvocationID%8; // 0...7
|
||||
const uint iq = it/4; // 0 or 1
|
||||
const uint ir = it%4; // 0...3
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = r0 * N_DST;
|
||||
const uint ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
|
||||
|
||||
const uint xblk = ib_row + offset0 + pcs.inAOff;
|
||||
const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff;
|
||||
|
||||
float yl[16];
|
||||
float yh[16];
|
||||
float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
|
||||
float all_sum = 0.f;
|
||||
|
||||
uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
|
||||
|
||||
for (uint ib = ix; ib < nb; ib += 4) {
|
||||
const uint blk_idx = ib + xblk;
|
||||
|
||||
float sumy[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0];
|
||||
yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
|
||||
yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
|
||||
yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
uint row_idx = row * nb;
|
||||
|
||||
uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
|
||||
uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
|
||||
uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
|
||||
uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
|
||||
uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
|
||||
|
||||
uint16_t sc16[4];
|
||||
sc16[0] = sc_0 & kmask1;
|
||||
sc16[1] = sc_2 & kmask1;
|
||||
sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
|
||||
sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
|
||||
|
||||
float acc1[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
float acc2[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
|
||||
uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
|
||||
acc1[0] += yl[i+0] * (q1 & 0x000F);
|
||||
acc1[1] += yl[i+1] * (q1 & 0x0F00);
|
||||
acc1[2] += yl[i+8] * (q1 & 0x00F0);
|
||||
acc1[3] += yl[i+9] * (q1 & 0xF000);
|
||||
acc2[0] += yh[i+0] * (q2 & 0x000F);
|
||||
acc2[1] += yh[i+1] * (q2 & 0x0F00);
|
||||
acc2[2] += yh[i+8] * (q2 & 0x00F0);
|
||||
acc2[3] += yh[i+9] * (q2 & 0xF000);
|
||||
}
|
||||
|
||||
uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
|
||||
uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
|
||||
uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
|
||||
uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
|
||||
uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
|
||||
uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
|
||||
uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
|
||||
uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
|
||||
|
||||
float dall = float(inA[blk_idx + row_idx].d);
|
||||
float dmin = float(inA[blk_idx + row_idx].dmin);
|
||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
|
||||
(acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
|
||||
(acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
|
||||
(acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
|
||||
dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
|
||||
}
|
||||
|
||||
y4 += 4 * QK_K;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = subgroupAdd(sumf[row]);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -2,6 +2,7 @@
|
|||
#define LLAMA_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
|
|
@ -16,6 +16,11 @@
|
|||
return self->threadpool;
|
||||
}
|
||||
|
||||
- (void)dealloc
|
||||
{
|
||||
ggml_threadpool_free(threadpool);
|
||||
[super dealloc];
|
||||
}
|
||||
@end
|
||||
|
||||
@implementation GGMLThreadpoolParams {
|
||||
|
|
|
@ -46,4 +46,10 @@
|
|||
common_sampler_reset(sampler);
|
||||
}
|
||||
|
||||
- (void)dealloc
|
||||
{
|
||||
common_sampler_free(sampler);
|
||||
[super dealloc];
|
||||
}
|
||||
|
||||
@end
|
||||
|
|
|
@ -20,8 +20,8 @@
|
|||
|
||||
- (void)dealloc
|
||||
{
|
||||
[super dealloc];
|
||||
llama_free(ctx);
|
||||
[super dealloc];
|
||||
}
|
||||
|
||||
- (void)attachThreadpool:(GGMLThreadpool *)threadpool
|
||||
|
|
|
@ -24,12 +24,8 @@
|
|||
|
||||
- (void)dealloc
|
||||
{
|
||||
[super dealloc];
|
||||
llama_free_model(model);
|
||||
}
|
||||
|
||||
- (LlamaContext *)context:(LlamaContextParams *)params {
|
||||
return nil;
|
||||
[super dealloc];
|
||||
}
|
||||
|
||||
- (BOOL)addBOSToken {
|
||||
|
|
|
@ -72,6 +72,18 @@
|
|||
[outputCondition unlock];
|
||||
return line;
|
||||
}
|
||||
|
||||
- (void)dealloc
|
||||
{
|
||||
[outputCondition unlock];
|
||||
[inputCondition unlock];
|
||||
[inputCondition dealloc];
|
||||
[outputCondition dealloc];
|
||||
[log dealloc];
|
||||
[inputQueue dealloc];
|
||||
[outputQueue dealloc];
|
||||
[super dealloc];
|
||||
}
|
||||
@end
|
||||
|
||||
@implementation LlamaSession {
|
||||
|
@ -107,6 +119,9 @@
|
|||
BOOL need_insert_eot;
|
||||
int n_ctx;
|
||||
os_log_t os_log_inst;
|
||||
|
||||
GGMLThreadpool *threadpool;
|
||||
GGMLThreadpool *threadpool_batch;
|
||||
}
|
||||
|
||||
- (NSString *)chat_add_and_format:(std::vector<common_chat_msg> &) chat_msgs role:(const std::string &) role content:(const std::string &) content {
|
||||
|
@ -135,6 +150,7 @@ static BOOL file_is_empty(NSString *path) {
|
|||
self = [super init];
|
||||
self->_params = [params copy];
|
||||
self->_mutableLastOutput = [[NSMutableString alloc] init];
|
||||
self->_queue = [BlockingLineQueue new];
|
||||
if (params.logging) {
|
||||
os_log_inst = OS_LOG_DEFAULT;
|
||||
} else {
|
||||
|
@ -186,7 +202,6 @@ static BOOL file_is_empty(NSString *path) {
|
|||
|
||||
set_process_priority(ggml_sched_priority(params.cpuParams.priority));
|
||||
|
||||
GGMLThreadpool *threadpool_batch;
|
||||
if (tpp != tpp_batch) {
|
||||
threadpool_batch = [tpp_batch threadpool];
|
||||
if (!threadpool_batch) {
|
||||
|
@ -198,7 +213,7 @@ static BOOL file_is_empty(NSString *path) {
|
|||
tpp.paused = true;
|
||||
}
|
||||
|
||||
GGMLThreadpool *threadpool = [tpp threadpool];
|
||||
threadpool = [tpp threadpool];
|
||||
if (!threadpool) {
|
||||
[NSException raise:@"ThreadpoolFailure"
|
||||
format:@"threadpool create failed"];
|
||||
|
@ -299,28 +314,28 @@ static BOOL file_is_empty(NSString *path) {
|
|||
n_matching_session_tokens++;
|
||||
}
|
||||
if ([params.prompt length] == 0 && n_matching_session_tokens == embd_inp.size()) {
|
||||
// LOG_INF("%s: using full prompt from session file\n", __func__);
|
||||
os_log_info(os_log_inst, "%s: using full prompt from session file\n", __func__);
|
||||
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
||||
// LOG_INF("%s: session file has exact match for prompt!\n", __func__);
|
||||
os_log_info(os_log_inst, "%s: session file has exact match for prompt!\n", __func__);
|
||||
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
||||
// LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||
// __func__, n_matching_session_tokens, embd_inp.size());
|
||||
os_log_error(os_log_inst, "%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
} else {
|
||||
// LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||
// __func__, n_matching_session_tokens, embd_inp.size());
|
||||
os_log_info(os_log_inst, "%s: session file matches %zu / %zu tokens of prompt\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
}
|
||||
|
||||
// remove any "future" tokens that we might have inherited from the previous session
|
||||
llama_kv_cache_seq_rm([self.ctx cContext], -1, n_matching_session_tokens, -1);
|
||||
}
|
||||
//
|
||||
// os_log_debug(os_log_inst, "recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
|
||||
// embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
|
||||
//
|
||||
|
||||
os_log_debug(os_log_inst, "recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
|
||||
embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
|
||||
|
||||
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
||||
// reevaluation of the last token to recalculate the cached logits
|
||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
|
||||
// os_log_debug(os_log_inst, "recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
|
||||
os_log_debug(os_log_inst, "recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
|
||||
|
||||
session_tokens.resize(embd_inp.size() - 1);
|
||||
}
|
||||
|
@ -344,37 +359,21 @@ static BOOL file_is_empty(NSString *path) {
|
|||
if (params.verbosePrompt) {
|
||||
os_log_info(os_log_inst,
|
||||
"%s: prompt: '%s'\n", __func__, [params.prompt cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
// LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
os_log_info(os_log_inst, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
os_log_info(os_log_inst, "%6d -> '%s'\n", embd_inp[i],
|
||||
[[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
|
||||
if (params.nKeep > addBOS) {
|
||||
// LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||
os_log_info(os_log_inst, "%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.nKeep; i++) {
|
||||
os_log_debug(os_log_inst, "%s",
|
||||
[[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
// // ctrl+C handling
|
||||
// {
|
||||
//#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
// struct sigaction sigint_action;
|
||||
// sigint_action.sa_handler = sigint_handler;
|
||||
// sigemptyset (&sigint_action.sa_mask);
|
||||
// sigint_action.sa_flags = 0;
|
||||
// sigaction(SIGINT, &sigint_action, NULL);
|
||||
//#elif defined (_WIN32)
|
||||
// auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
// return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
// };
|
||||
// SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
//#endif
|
||||
// }
|
||||
//
|
||||
|
||||
if (params.interactive) {
|
||||
os_log_info(os_log_inst, "%s: interactive mode on.\n", __func__);
|
||||
|
||||
|
@ -427,9 +426,8 @@ static BOOL file_is_empty(NSString *path) {
|
|||
os_log_info(os_log_inst, "sampler seed: %u\n", [_smpl seed]);
|
||||
// LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||
// LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
|
||||
//
|
||||
// LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
//
|
||||
os_log_info(os_log_inst, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.nBatch, params.nPredict, params.nKeep);
|
||||
|
||||
// group-attention state
|
||||
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
|
||||
|
||||
|
@ -439,8 +437,8 @@ static BOOL file_is_empty(NSString *path) {
|
|||
if (ga_n != 1) {
|
||||
GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
|
||||
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
||||
GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
|
||||
GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
||||
os_log_info(os_log_inst, "self-extend: n_ctx_train = %d, grp_attn_n = %ld, grp_attn_w = %ld\n", n_ctx_train, static_cast<long>(ga_n), static_cast<long>(ga_w));
|
||||
}
|
||||
|
||||
|
@ -493,7 +491,7 @@ static BOOL file_is_empty(NSString *path) {
|
|||
return [_mutableLastOutput copy];
|
||||
}
|
||||
|
||||
- (void)start:(BlockingLineQueue *)queue {
|
||||
- (void)start {
|
||||
while ((n_remain != 0 && !is_antiprompt) || _params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
|
@ -506,9 +504,7 @@ static BOOL file_is_empty(NSString *path) {
|
|||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
// console::set_display(console::error);
|
||||
os_log_error(os_log_inst, "<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
// console::set_display(console::reset);
|
||||
}
|
||||
|
||||
if (_params.grpAttnN == 1) {
|
||||
|
@ -626,8 +622,9 @@ static BOOL file_is_empty(NSString *path) {
|
|||
// optionally save the session on first sample (for faster prompt loading next time)
|
||||
if ([pathSession length] > 0 && need_to_save_session && !_params.promptCacheRO) {
|
||||
need_to_save_session = false;
|
||||
[self.ctx saveStateFile:pathSession tokens:session_tokens.data() nTokenCount:session_tokens.size()];
|
||||
// llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
[self.ctx saveStateFile:pathSession
|
||||
tokens:session_tokens.data()
|
||||
nTokenCount:session_tokens.size()];
|
||||
|
||||
os_log_debug(os_log_inst, "saved session to %s\n", [pathSession cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
|
@ -636,8 +633,6 @@ static BOOL file_is_empty(NSString *path) {
|
|||
|
||||
[_smpl accept:idToken acceptGrammar:true];
|
||||
|
||||
// os_log_debug(os_log_inst, "last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||
|
||||
embd.push_back(idToken);
|
||||
|
||||
// echo this to console
|
||||
|
@ -666,7 +661,6 @@ static BOOL file_is_empty(NSString *path) {
|
|||
|
||||
// display text
|
||||
if (input_echo && display) {
|
||||
// std::cout<< "DISPLAYING TEXT" << std::endl;
|
||||
|
||||
for (auto idToken : embd) {
|
||||
NSString *token_str = [self.ctx tokenToPiece:idToken special:_params.special];
|
||||
|
@ -685,24 +679,13 @@ static BOOL file_is_empty(NSString *path) {
|
|||
output_tokens.push_back(idToken);
|
||||
output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
last_output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
NSLog(@"Generated %s", last_output_ss.str().c_str());
|
||||
[self willChangeValueForKey:@"lastOutput"];
|
||||
[_mutableLastOutput appendString:token_str];
|
||||
[self didChangeValueForKey:@"lastOutput"];
|
||||
}
|
||||
|
||||
}
|
||||
if (!last_output_ss.str().empty()) {
|
||||
// queue->addOutputLine(last_output_ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
if (!last_output_ss.str().empty()) {
|
||||
// queue->addOutputLine(last_output_ss.str());
|
||||
}
|
||||
// console::set_display(console::reset);
|
||||
display = true;
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
|
@ -769,7 +752,6 @@ static BOOL file_is_empty(NSString *path) {
|
|||
content:assistant_ss.str()];
|
||||
}
|
||||
isInteracting = true;
|
||||
// LOG("\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -797,39 +779,23 @@ static BOOL file_is_empty(NSString *path) {
|
|||
os_log_info(os_log_inst, "%s", [_params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
|
||||
// color user input only
|
||||
// console::set_display(console::user_input);
|
||||
display = _params.displayPrompt;
|
||||
|
||||
std::string line;
|
||||
// bool another_line = true;
|
||||
static int read_one = 0;
|
||||
// if (!read_one) {
|
||||
// do {
|
||||
// another_line = false;// console::readline(line, params.multiline_input);
|
||||
// buffer += "What is the weather in New York?";//line;
|
||||
// } while (another_line);
|
||||
// read_one++;
|
||||
// }
|
||||
// else {
|
||||
|
||||
if (!last_output_ss.str().empty()) {
|
||||
auto str = last_output_ss.str();
|
||||
last_output_ss.str("");
|
||||
[queue addOutputLine:[NSString stringWithCString:str.c_str() encoding:NSUTF8StringEncoding]];
|
||||
[_queue addOutputLine:[NSString stringWithCString:str.c_str() encoding:NSUTF8StringEncoding]];
|
||||
[self willChangeValueForKey:@"lastOutput"];
|
||||
_mutableLastOutput = [[NSMutableString alloc] init];
|
||||
[self didChangeValueForKey:@"lastOutput"];
|
||||
|
||||
}
|
||||
|
||||
buffer = [[queue inputLine] cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
// do {
|
||||
// another_line = console::readline(line, params.multiline_input);
|
||||
// buffer += line;
|
||||
// } while (another_line);
|
||||
// }
|
||||
// done taking input, reset color
|
||||
// console::set_display(console::reset);
|
||||
buffer = [[_queue inputLine] cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
if ([_queue isClosed]) {
|
||||
break;
|
||||
}
|
||||
|
||||
display = true;
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
|
@ -881,7 +847,6 @@ static BOOL file_is_empty(NSString *path) {
|
|||
output_tokens.push_back(token);
|
||||
output_ss << [[self.ctx tokenToPiece:token] cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
}
|
||||
|
||||
// reset assistant message
|
||||
assistant_ss.str("");
|
||||
|
||||
|
@ -915,6 +880,30 @@ static BOOL file_is_empty(NSString *path) {
|
|||
isInteracting = true;
|
||||
}
|
||||
}
|
||||
|
||||
NSLog(@"Loop over");
|
||||
[_queue addOutputLine:@""];
|
||||
}
|
||||
|
||||
- (void)stop {
|
||||
isInteracting = false;
|
||||
_params.interactive = false;
|
||||
_queue.isClosed = YES;
|
||||
[_queue addInputLine:@""];
|
||||
[_queue outputLine];
|
||||
}
|
||||
|
||||
- (void)dealloc
|
||||
{
|
||||
[_queue dealloc];
|
||||
[self.smpl dealloc];
|
||||
[self.ctx dealloc];
|
||||
[self.model dealloc];
|
||||
llama_backend_free();
|
||||
[threadpool dealloc];
|
||||
[threadpool_batch dealloc];
|
||||
|
||||
[super dealloc];
|
||||
}
|
||||
|
||||
@end
|
||||
|
|
|
@ -19,7 +19,6 @@ typedef int32_t LlamaToken;
|
|||
|
||||
@interface LlamaModel : NSObject
|
||||
|
||||
- (LlamaContext *)context:(LlamaContextParams *)params;
|
||||
- (LlamaToken)tokenBOS;
|
||||
- (LlamaToken)tokenEOT;
|
||||
- (LlamaToken)tokenEOS;
|
||||
|
|
|
@ -12,6 +12,8 @@
|
|||
- (void)addOutputLine:(NSString *)line;
|
||||
- (NSString *)outputLine;
|
||||
|
||||
@property (nonatomic, assign) BOOL isClosed;
|
||||
|
||||
@end
|
||||
|
||||
NS_REFINED_FOR_SWIFT @interface LlamaSession : NSObject
|
||||
|
@ -19,9 +21,11 @@ NS_REFINED_FOR_SWIFT @interface LlamaSession : NSObject
|
|||
@property (nonatomic, strong) LlamaModel *model;
|
||||
@property (nonatomic, strong) LlamaContext *ctx;
|
||||
@property (nonatomic, strong, readonly) NSString *lastOutput;
|
||||
@property (nonatomic, strong) BlockingLineQueue *queue;
|
||||
|
||||
- (instancetype)initWithParams:(GPTParams *)params;
|
||||
- (void)start:(BlockingLineQueue *)queue;
|
||||
- (void)start;
|
||||
- (void)stop;
|
||||
|
||||
@end
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include <type_traits>
|
||||
|
||||
#include <ggml.h>
|
||||
#include <ggml-cpu.h>
|
||||
|
||||
constexpr int kVecSize = 1 << 16;
|
||||
|
||||
|
@ -136,7 +137,7 @@ int main(int argc, char** argv) {
|
|||
|
||||
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
|
||||
|
||||
const auto * funcs = ggml_get_type_traits(ggml_type);
|
||||
const auto * funcs = ggml_get_type_traits_cpu(ggml_type);
|
||||
|
||||
Stat simple, ggml;
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <array>
|
||||
|
||||
#include <ggml.h>
|
||||
#include <ggml-cpu.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
|
@ -236,7 +237,8 @@ int main(int argc, char** argv) {
|
|||
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
|
||||
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
|
||||
|
||||
const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0);
|
||||
const auto * funcs = ggml_get_type_traits(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
|
||||
const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
|
||||
|
||||
std::vector<block_q4_0> q40;
|
||||
std::vector<block_q4_1> q41;
|
||||
|
@ -282,10 +284,10 @@ int main(int argc, char** argv) {
|
|||
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
|
||||
}
|
||||
else {
|
||||
const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type);
|
||||
const auto * vdot = ggml_get_type_traits(funcs_cpu->vec_dot_type);
|
||||
vdot->from_float(y1.data(), q8.data(), kVecSize);
|
||||
if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
||||
else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
||||
if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
||||
else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
||||
}
|
||||
sumq += result;
|
||||
t2 = std::chrono::high_resolution_clock::now();
|
||||
|
|
|
@ -1 +1 @@
|
|||
162e232411ee98ceb0cccfa84886118d917d2123
|
||||
a099cb514d6687e436a5a423d1fb0448be0feb20
|
||||
|
|
1
spm-headers/ggml-cpp.h
Symbolic link
1
spm-headers/ggml-cpp.h
Symbolic link
|
@ -0,0 +1 @@
|
|||
../ggml/include/ggml-cpp.h
|
1
spm-headers/ggml-cpu.h
Symbolic link
1
spm-headers/ggml-cpu.h
Symbolic link
|
@ -0,0 +1 @@
|
|||
../ggml/include/ggml-cpu.h
|
670
src/llama.cpp
670
src/llama.cpp
File diff suppressed because it is too large
Load diff
|
@ -2,100 +2,45 @@ import Foundation
|
|||
@_exported import JSONSchema
|
||||
@_exported import LlamaObjC
|
||||
|
||||
public protocol DynamicCallable: Sendable {
|
||||
@discardableResult
|
||||
func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String
|
||||
}
|
||||
|
||||
public enum AnyDecodable: Decodable {
|
||||
case string(String)
|
||||
case int(Int)
|
||||
case double(Double)
|
||||
case bool(Bool)
|
||||
case null
|
||||
// Add other cases as needed
|
||||
|
||||
// Initializers for each type
|
||||
init(_ value: String) {
|
||||
self = .string(value)
|
||||
}
|
||||
|
||||
init(_ value: Int) {
|
||||
self = .int(value)
|
||||
}
|
||||
|
||||
init(_ value: Double) {
|
||||
self = .double(value)
|
||||
}
|
||||
|
||||
init(_ value: Bool) {
|
||||
self = .bool(value)
|
||||
}
|
||||
|
||||
init() {
|
||||
self = .null
|
||||
}
|
||||
|
||||
// Decodable conformance
|
||||
public init(from decoder: Decoder) throws {
|
||||
let container = try decoder.singleValueContainer()
|
||||
|
||||
if container.decodeNil() {
|
||||
self = .null
|
||||
} else if let intValue = try? container.decode(Int.self) {
|
||||
self = .int(intValue)
|
||||
} else if let doubleValue = try? container.decode(Double.self) {
|
||||
self = .double(doubleValue)
|
||||
} else if let boolValue = try? container.decode(Bool.self) {
|
||||
self = .bool(boolValue)
|
||||
} else if let stringValue = try? container.decode(String.self) {
|
||||
self = .string(stringValue)
|
||||
} else {
|
||||
let context = DecodingError.Context(
|
||||
codingPath: decoder.codingPath,
|
||||
debugDescription: "Cannot decode AnyDecodable"
|
||||
)
|
||||
throw DecodingError.typeMismatch(AnyDecodable.self, context)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ToolCall: Decodable {
|
||||
let id: Int
|
||||
let name: String
|
||||
let arguments: [String: AnyDecodable]
|
||||
}
|
||||
|
||||
struct ToolResponse<T: Encodable>: Encodable {
|
||||
let id: Int
|
||||
let result: T
|
||||
}
|
||||
|
||||
// MARK: LlamaChatSession
|
||||
/// Standard chat session for a given LLM.
|
||||
|
||||
/// An actor that manages a standard chat session with a given Large Language Model (LLM).
|
||||
///
|
||||
/// `LlamaChatSession` provides methods to interact with the LLM, allowing for synchronous
|
||||
/// and asynchronous message inference. It uses a `BlockingLineQueue` to manage the input
|
||||
/// and output lines for communication with the LLM backend.
|
||||
///
|
||||
/// ### Key Responsibilities:
|
||||
/// - Initialize the session with the specified parameters.
|
||||
/// - Send messages to the LLM and receive responses.
|
||||
/// - Provide an asynchronous stream for real-time inference results.
|
||||
public actor LlamaChatSession {
|
||||
private let queue = BlockingLineQueue()
|
||||
private let session: __LlamaSession
|
||||
/// The underlying session object interfacing with the LLM backend.
|
||||
internal let session: __LlamaSession
|
||||
|
||||
/// Initialize the session
|
||||
/// - parameter params: common parameters to initialize the session
|
||||
/// - parameter flush: whether or not to flush the initial prompt, reading initial output
|
||||
public init(params: GPTParams, flush: Bool = true) async throws {
|
||||
self.session = __LlamaSession(params: params)
|
||||
Task.detached { [session, queue] in
|
||||
session.start(queue)
|
||||
Task.detached { [session] in
|
||||
session.start()
|
||||
}
|
||||
|
||||
// flush
|
||||
guard flush else { return }
|
||||
_ = queue.outputLine()
|
||||
_ = session.queue.outputLine()
|
||||
}
|
||||
|
||||
/// Create a new inference stream for a given message
|
||||
/// - parameter message: The message to receive an inference for.
|
||||
/// - returns: A stream of output from the LLM.
|
||||
/// Creates an asynchronous stream for inference results based on a given message.
|
||||
///
|
||||
/// This method sends a message to the LLM and returns an `AsyncStream` that yields
|
||||
/// output as it becomes available, allowing for real-time streaming of inference results.
|
||||
///
|
||||
/// - Parameter message: The message to send to the LLM for inference.
|
||||
/// - Returns: An `AsyncStream` of `String` values representing incremental output from the LLM.
|
||||
public func inferenceStream(message: String) async -> AsyncStream<String> {
|
||||
queue.addInputLine(message)
|
||||
session.queue.addInputLine(message)
|
||||
var observationToken: NSKeyValueObservation?
|
||||
return AsyncStream { stream in
|
||||
observationToken = self.session.observe(\.lastOutput, options: [.new, .old]) { session, change in
|
||||
|
@ -119,17 +64,49 @@ public actor LlamaChatSession {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Sends a message to the LLM and returns the response.
|
||||
///
|
||||
/// This method sends a message to the LLM and waits for the complete response.
|
||||
///
|
||||
/// - Parameter message: The message to send to the LLM for inference.
|
||||
/// - Returns: A `String` containing the LLM's response to the message.
|
||||
public func infer(message: String) async -> String {
|
||||
queue.addInputLine(message)
|
||||
return queue.outputLine()
|
||||
session.queue.addInputLine(message)
|
||||
return session.queue.outputLine()
|
||||
}
|
||||
|
||||
deinit {
|
||||
session.stop()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// MARK: LlamaGrammarSession
|
||||
|
||||
/// An actor that manages a chat session with the LLM, enforcing a grammar defined by a JSON schema.
|
||||
///
|
||||
/// `LlamaSession` allows you to interact with the LLM while constraining the output to match a specified JSON schema.
|
||||
/// It uses the `JSONSchema` and `JSONSchemaConvertible` protocols to define the expected output format,
|
||||
/// and applies a grammar to the LLM's sampler parameters.
|
||||
///
|
||||
/// ### Key Responsibilities:
|
||||
/// - Initialize the session with a specified JSON schema, converting it to a grammar for the LLM.
|
||||
/// - Send messages to the LLM and decode the responses into a specified type `T`.
|
||||
///
|
||||
/// ### Type Parameters:
|
||||
/// - `T`: A type that conforms to `JSONSchemaConvertible` and represents the expected output format.
|
||||
public actor LlamaSession<T: JSONSchemaConvertible> {
|
||||
/// The underlying chat session used to interact with the LLM.
|
||||
private let session: LlamaChatSession
|
||||
|
||||
/// Initializes a new grammar-constrained session with the LLM.
|
||||
///
|
||||
/// - Parameters:
|
||||
/// - params: The parameters used to initialize the session, such as model settings.
|
||||
/// - flush: A boolean indicating whether to flush the initial prompt and read initial output.
|
||||
/// Defaults to `true`.
|
||||
/// - Throws: An error if the session fails to initialize.
|
||||
public init(params: GPTParams, flush: Bool = true) async throws {
|
||||
let converter = SchemaConverter(propOrder: [])
|
||||
_ = converter.visit(schema: T.jsonSchema, name: nil)
|
||||
|
@ -137,163 +114,16 @@ public actor LlamaSession<T: JSONSchemaConvertible> {
|
|||
session = try await LlamaChatSession(params: params, flush: flush)
|
||||
}
|
||||
|
||||
public func chat(message: String) async throws -> T {
|
||||
/// Sends a message to the LLM and decodes the response into type `T`.
|
||||
///
|
||||
/// This method sends a message to the LLM, receives the output, and attempts to decode it into the specified type `T`.
|
||||
/// It enforces that the LLM's output conforms to the JSON schema associated with `T`.
|
||||
///
|
||||
/// - Parameter message: The message to send to the LLM.
|
||||
/// - Returns: An instance of type `T` decoded from the LLM's response.
|
||||
/// - Throws: An error if the decoding fails.
|
||||
public func infer(message: String) async throws -> T {
|
||||
let output = await session.infer(message: message).data(using: .utf8)!
|
||||
return try JSONDecoder().decode(T.self, from: output)
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: LlamaToolSession
|
||||
public actor LlamaToolSession {
|
||||
private let session: LlamaChatSession
|
||||
|
||||
private struct GetIpAddress: DynamicCallable {
|
||||
func dynamicallyCall(withKeywordArguments args: [String : Any]) async throws -> String {
|
||||
getIPAddress()
|
||||
}
|
||||
}
|
||||
|
||||
internal static func getIPAddress() -> String {
|
||||
var address: String!
|
||||
|
||||
// Get list of all interfaces on the local machine:
|
||||
var ifaddr: UnsafeMutablePointer<ifaddrs>? = nil
|
||||
if getifaddrs(&ifaddr) == 0 {
|
||||
// Loop through linked list of interfaces
|
||||
var ptr = ifaddr
|
||||
while ptr != nil {
|
||||
let interface = ptr!.pointee
|
||||
|
||||
// Check if the interface is IPv4 or IPv6:
|
||||
let addrFamily = interface.ifa_addr.pointee.sa_family
|
||||
if addrFamily == UInt8(AF_INET) || addrFamily == UInt8(AF_INET6) {
|
||||
|
||||
// Convert interface name to String:
|
||||
let name = String(cString: interface.ifa_name)
|
||||
|
||||
// Only consider non-loopback interfaces (e.g., "en0" for Wi-Fi)
|
||||
if name == "en0" { // Typically en0 is the Wi-Fi interface
|
||||
// Convert the address to a readable format:
|
||||
var hostname = [CChar](repeating: 0, count: Int(NI_MAXHOST))
|
||||
if getnameinfo(interface.ifa_addr, socklen_t(interface.ifa_addr.pointee.sa_len),
|
||||
&hostname, socklen_t(hostname.count),
|
||||
nil, socklen_t(0), NI_NUMERICHOST) == 0 {
|
||||
address = String(cString: hostname)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ptr = interface.ifa_next
|
||||
}
|
||||
|
||||
freeifaddrs(ifaddr)
|
||||
}
|
||||
|
||||
return address
|
||||
}
|
||||
|
||||
public private(set) var tools: [String: (DynamicCallable, _JSONFunctionSchema)]
|
||||
|
||||
public init(params: GPTParams,
|
||||
tools: [String: (DynamicCallable, _JSONFunctionSchema)]) async throws {
|
||||
self.tools = tools
|
||||
let ipFnSchema = _JSONFunctionSchema(name: "getIpAddress", description: "Get the IP Address for this system", parameters: _JSONFunctionSchema.Parameters(properties: [:], required: []))
|
||||
self.tools["getIpAddress"] = (GetIpAddress(), ipFnSchema)
|
||||
let encoded = try JSONEncoder().encode(self.tools.values.map(\.1))
|
||||
let prompt = """
|
||||
\(params.prompt ?? "")
|
||||
|
||||
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"name": <function-name>,"arguments": <args-dict>}
|
||||
</tool_call>
|
||||
|
||||
Feel free to chain tool calls, e.g., if you need the user's location to find points of interest near them, fetch the user's location first.
|
||||
The first call you will be asked to warm up is to get the user's IP address. Here are the available tools:
|
||||
<tools> \(String(data: encoded, encoding: .utf8)!) </tools><|eot_id|>
|
||||
"""
|
||||
params.prompt = prompt
|
||||
params.interactive = true
|
||||
params.antiPrompts.append("<|eot_id|>");
|
||||
params.inputPrefix = "<|start_header_id|>user<|end_header_id|>";
|
||||
params.inputSuffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>";
|
||||
session = try await LlamaChatSession(params: params, flush: false)
|
||||
let fn = await session.infer(message: "What is my IP address?")
|
||||
let toolCall = try JSONDecoder().decode(ToolCall.self, from: fn.data(using: .utf8)!)
|
||||
guard let tool = self.tools[toolCall.name] else {
|
||||
fatalError()
|
||||
}
|
||||
let resp = try await tool.0.dynamicallyCall(withKeywordArguments: toolCall.arguments)
|
||||
print(resp)
|
||||
|
||||
let output = await session.infer(message: """
|
||||
<tool_response>
|
||||
{"id": \(toolCall.id), result: \(resp)}
|
||||
</tool_response>
|
||||
""")
|
||||
print(output)
|
||||
}
|
||||
|
||||
private func callTool(_ call: String) async -> String? {
|
||||
var nxt: String?
|
||||
do {
|
||||
let toolCall = try JSONDecoder().decode(ToolCall.self, from: call.data(using: .utf8)!)
|
||||
guard let tool = tools[toolCall.name] else {
|
||||
fatalError()
|
||||
}
|
||||
// TODO: tool call decode is allowed to fail but the code below is not
|
||||
let callable = tool.0
|
||||
|
||||
do {
|
||||
let response = try await callable.dynamicallyCall(withKeywordArguments: toolCall.arguments)
|
||||
print("tool response: \(response)")
|
||||
nxt = await session.infer(message: """
|
||||
<tool_response>
|
||||
{"id": \(toolCall.id), result: \(response)}
|
||||
</tool_response>
|
||||
""")
|
||||
// TODO: If this decodes correctly, we should tail this into this method
|
||||
// TODO: so that we do not decode twice
|
||||
if let _ = try? JSONDecoder().decode(ToolCall.self, from: nxt!.data(using: .utf8)!) {
|
||||
return await callTool(nxt!)
|
||||
}
|
||||
} catch {
|
||||
nxt = await session.infer(message: """
|
||||
<tool_response>
|
||||
{"id": \(toolCall.id), result: "The tool call has unfortunately failed."}
|
||||
</tool_response>
|
||||
""")
|
||||
}
|
||||
print(nxt ?? "nil")
|
||||
} catch {}
|
||||
return nxt
|
||||
}
|
||||
|
||||
public func infer(message: String) async throws -> String {
|
||||
let output = await session.infer(message: message)
|
||||
guard let output = await callTool(output) else {
|
||||
return output
|
||||
}
|
||||
return output
|
||||
}
|
||||
}
|
||||
|
||||
public protocol LlamaActor: Actor {
|
||||
static func tools(_ self: Self) -> [String: (DynamicCallable, _JSONFunctionSchema)]
|
||||
var session: LlamaToolSession! { get }
|
||||
}
|
||||
|
||||
public extension LlamaActor {
|
||||
func chat(_ message: String) async throws -> String {
|
||||
try await session.infer(message: message)
|
||||
}
|
||||
}
|
||||
|
||||
@attached(member, names: arbitrary)
|
||||
@attached(extension, conformances: LlamaActor, names: arbitrary)
|
||||
public macro llamaActor() = #externalMacro(module: "LlamaKitMacros",
|
||||
type: "LlamaActorMacro")
|
||||
|
||||
@attached(body)
|
||||
public macro Tool() = #externalMacro(module: "LlamaKitMacros",
|
||||
type: "ToolMacro")
|
||||
|
|
402
swift/LlamaKit/LlamaToolSession.swift
Normal file
402
swift/LlamaKit/LlamaToolSession.swift
Normal file
|
@ -0,0 +1,402 @@
|
|||
import Foundation
|
||||
|
||||
|
||||
public protocol DynamicCallable: Sendable {
|
||||
@discardableResult
|
||||
func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String
|
||||
}
|
||||
|
||||
public enum AnyDecodable: Decodable {
|
||||
case string(String)
|
||||
case int(Int)
|
||||
case double(Double)
|
||||
case bool(Bool)
|
||||
case null
|
||||
// Add other cases as needed
|
||||
|
||||
// Initializers for each type
|
||||
init(_ value: String) {
|
||||
self = .string(value)
|
||||
}
|
||||
|
||||
init(_ value: Int) {
|
||||
self = .int(value)
|
||||
}
|
||||
|
||||
init(_ value: Double) {
|
||||
self = .double(value)
|
||||
}
|
||||
|
||||
init(_ value: Bool) {
|
||||
self = .bool(value)
|
||||
}
|
||||
|
||||
init() {
|
||||
self = .null
|
||||
}
|
||||
|
||||
// Decodable conformance
|
||||
public init(from decoder: Decoder) throws {
|
||||
let container = try decoder.singleValueContainer()
|
||||
|
||||
if container.decodeNil() {
|
||||
self = .null
|
||||
} else if let intValue = try? container.decode(Int.self) {
|
||||
self = .int(intValue)
|
||||
} else if let doubleValue = try? container.decode(Double.self) {
|
||||
self = .double(doubleValue)
|
||||
} else if let boolValue = try? container.decode(Bool.self) {
|
||||
self = .bool(boolValue)
|
||||
} else if let stringValue = try? container.decode(String.self) {
|
||||
self = .string(stringValue)
|
||||
} else {
|
||||
let context = DecodingError.Context(
|
||||
codingPath: decoder.codingPath,
|
||||
debugDescription: "Cannot decode AnyDecodable"
|
||||
)
|
||||
throw DecodingError.typeMismatch(AnyDecodable.self, context)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ToolCall: Decodable {
|
||||
let id: Int
|
||||
let name: String
|
||||
let arguments: [String: AnyDecodable]
|
||||
}
|
||||
|
||||
struct ToolResponse<T: Encodable>: Encodable {
|
||||
let id: Int
|
||||
let result: T
|
||||
}
|
||||
|
||||
// MARK: LlamaToolSession
|
||||
/// An actor that manages tool calls within a chat session using Llama language models.
|
||||
///
|
||||
/// `LlamaToolSession` is responsible for handling interactions between the language model and dynamically callable tools.
|
||||
/// It maintains a dictionary of tools that can be invoked based on the language model's output.
|
||||
/// With the integration of the `@llamaActor` and `@Tool` macros, users can easily define custom tools within their own actors,
|
||||
/// which are then registered and managed by `LlamaToolSession`.
|
||||
///
|
||||
/// ### Key Responsibilities:
|
||||
/// - Initializes the language model session with the provided tools and prompt.
|
||||
/// - Parses the language model's output to detect tool calls and invokes the corresponding tools.
|
||||
/// - Sends tool responses back to the language model to generate coherent replies.
|
||||
/// - Supports both synchronous and asynchronous inference, including streaming results.
|
||||
///
|
||||
/// ### Integration with Macros:
|
||||
/// - The `@llamaActor` macro simplifies the creation of actors with tools by automatically generating the necessary code to integrate with `LlamaToolSession`.
|
||||
/// - The `@Tool` macro allows functions within the actor to be marked as tools, which are registered and made available to the language model.
|
||||
///
|
||||
/// ### Usage:
|
||||
/// ```swift
|
||||
/// @llamaActor actor MyLlama {
|
||||
/// /// Gets the user's favorite season.
|
||||
/// @Tool public func getFavoriteSeason() async throws -> String {
|
||||
/// return "autumn"
|
||||
/// }
|
||||
///
|
||||
/// /// Gets the user's favorite animal.
|
||||
/// @Tool public func getFavoriteAnimal() async throws -> String {
|
||||
/// return "cat"
|
||||
/// }
|
||||
/// }
|
||||
///
|
||||
/// let params = GPTParams(prompt: "Your initial prompt")
|
||||
/// let myLlama = try await MyLlama(params: params)
|
||||
/// let response = try await myLlama.session.infer(message: "What is my favorite season?")
|
||||
/// print(response)
|
||||
/// ```
|
||||
///
|
||||
/// ### Architecture Diagram:
|
||||
/// ```
|
||||
/// +-------------------+
|
||||
/// | MyLlama Actor |
|
||||
/// | (User-Defined) |
|
||||
/// | |
|
||||
/// | @llamaActor |
|
||||
/// | +---------------+ |
|
||||
/// | | getFavorite...|<------------------------------+
|
||||
/// | | (Tool funcs) | |
|
||||
/// +---------+-------+ |
|
||||
/// | |
|
||||
/// | Uses |
|
||||
/// v |
|
||||
/// +-------------------+ +-----------------+ +------------------+
|
||||
/// | LlamaToolSession | | LlamaChatSession| | __LlamaSession |
|
||||
/// | (Actor) | | (Actor) | | (Objective-C) |
|
||||
/// +---------+---------+ +---------+-------+ +---------+--------+
|
||||
/// | ^ ^
|
||||
/// | | |
|
||||
/// | | Observes |
|
||||
/// v | |
|
||||
/// +-------------------+ +-----------------+ +------------------+
|
||||
/// | BlockingLineQueue |---------->| LlamaSession |<------| Tools (Macros) |
|
||||
/// | (Objective-C) | Input/ | (Objective-C) | | (DynamicCallable)|
|
||||
/// +-------------------+ Output +----------------+ +------------------+
|
||||
/// ```
|
||||
///
|
||||
/// ### Notes:
|
||||
/// - **Beta Feature**: This feature is currently in beta and may undergo significant changes.
|
||||
/// - **Template Format**: `LlamaToolSession` currently only works with the llama chat template format.
|
||||
/// - **Thread Safety**: `LlamaToolSession` is an actor to ensure thread-safe operations in a concurrent environment.
|
||||
/// - **Extensibility**: Users can define custom tools using the `@Tool` macro within an actor annotated with `@llamaActor`.
|
||||
/// - **Error Handling**: The session handles tool call failures gracefully, allowing the language model to continue processing.
|
||||
///
|
||||
/// ### See Also:
|
||||
/// - `LlamaChatSession`
|
||||
/// - `@llamaActor` Macro
|
||||
/// - `@Tool` Macro
|
||||
public actor LlamaToolSession {
|
||||
private let session: LlamaChatSession
|
||||
|
||||
public private(set) var tools: [String: (DynamicCallable, _JSONFunctionSchema)]
|
||||
|
||||
public init(params: GPTParams,
|
||||
tools: [String: (DynamicCallable, _JSONFunctionSchema)]) async throws {
|
||||
self.tools = tools
|
||||
let encoded = try JSONEncoder().encode(self.tools.values.map(\.1))
|
||||
let prompt = """
|
||||
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
||||
|
||||
\(params.prompt ?? "")
|
||||
|
||||
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"name": <function-name>,"arguments": <args-dict>}
|
||||
</tool_call>
|
||||
|
||||
Feel free to chain tool calls, e.g., if you need the user's location to find points of interest near them, fetch the user's location first.
|
||||
|
||||
<tools> \(String(data: encoded, encoding: .utf8)!) </tools>
|
||||
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
||||
"""
|
||||
params.prompt = prompt
|
||||
params.interactive = true
|
||||
params.antiPrompts.append("<|eot_id|>")
|
||||
params.inputPrefix = "<|start_header_id|>user<|end_header_id|>"
|
||||
params.inputSuffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
|
||||
session = try await LlamaChatSession(params: params, flush: false)
|
||||
guard let line = await session.session.queue.outputLine() else {
|
||||
return
|
||||
}
|
||||
_ = await callTool(line)
|
||||
}
|
||||
|
||||
private func callTool(_ call: ToolCall) async throws -> String {
|
||||
guard let tool = tools[call.name] else {
|
||||
fatalError()
|
||||
}
|
||||
// TODO: tool call decode is allowed to fail but the code below is not
|
||||
let callable = tool.0
|
||||
|
||||
return try await callable.dynamicallyCall(withKeywordArguments: call.arguments)
|
||||
}
|
||||
|
||||
private func callTool(_ call: String) async -> String? {
|
||||
var nxt: String?
|
||||
do {
|
||||
let toolCall = try JSONDecoder().decode(ToolCall.self, from: call.data(using: .utf8)!)
|
||||
guard let tool = tools[toolCall.name] else {
|
||||
fatalError()
|
||||
}
|
||||
// TODO: tool call decode is allowed to fail but the code below is not
|
||||
let callable = tool.0
|
||||
|
||||
do {
|
||||
let response = try await callable.dynamicallyCall(withKeywordArguments: toolCall.arguments)
|
||||
print("tool response: \(response)")
|
||||
nxt = await session.infer(message: """
|
||||
<tool_response>
|
||||
{"id": \(toolCall.id), result: \(response)}
|
||||
</tool_response>
|
||||
""")
|
||||
// TODO: If this decodes correctly, we should tail this into this method
|
||||
// TODO: so that we do not decode twice
|
||||
if let _ = try? JSONDecoder().decode(ToolCall.self, from: nxt!.data(using: .utf8)!) {
|
||||
return await callTool(nxt!)
|
||||
}
|
||||
} catch {
|
||||
nxt = await session.infer(message: """
|
||||
<tool_response>
|
||||
{"id": \(toolCall.id), result: "The tool call has unfortunately failed."}
|
||||
</tool_response>
|
||||
""")
|
||||
}
|
||||
print(nxt ?? "nil")
|
||||
} catch {}
|
||||
return nxt
|
||||
}
|
||||
|
||||
public func infer(message: String) async throws -> String {
|
||||
let output = await session.infer(message: message)
|
||||
guard let output = await callTool(output) else {
|
||||
return output
|
||||
}
|
||||
return output
|
||||
}
|
||||
|
||||
public func inferenceStream(message: String) async -> AsyncStream<String> {
|
||||
let underlyingSession = await session.session
|
||||
await session.session.queue.addInputLine(message)
|
||||
var observationToken: NSKeyValueObservation?
|
||||
actor StreamProcessor {
|
||||
private(set) var buffer = ""
|
||||
private(set) var totalBuffer = ""
|
||||
private(set) var isProcessingToolCall = false
|
||||
|
||||
func setBuffer(_ value: String) {
|
||||
buffer = value
|
||||
}
|
||||
func appendBuffer(_ value: String) {
|
||||
buffer.append(value)
|
||||
totalBuffer.append(value)
|
||||
}
|
||||
func setTotalBuffer(_ value: String) {
|
||||
totalBuffer = value
|
||||
}
|
||||
func setIsProcessingToolCall(_ value: Bool) {
|
||||
isProcessingToolCall = value
|
||||
}
|
||||
}
|
||||
|
||||
let streamProcessor = StreamProcessor()
|
||||
return AsyncStream { continuation in
|
||||
observationToken = underlyingSession.observe(\.lastOutput, options: [.new, .old]) { session, change in
|
||||
Task {
|
||||
guard let newValue = change.newValue,
|
||||
let oldValue = change.oldValue else {
|
||||
continuation.finish()
|
||||
return
|
||||
}
|
||||
var delta = ""
|
||||
for change in newValue!.difference(from: oldValue!) {
|
||||
switch change {
|
||||
case .remove(_, _, _):
|
||||
if await streamProcessor.isProcessingToolCall {
|
||||
let call = try await JSONDecoder().decode(ToolCall.self, from: streamProcessor.totalBuffer.data(using: .utf8)!)
|
||||
await streamProcessor.setBuffer("")
|
||||
await streamProcessor.setTotalBuffer("")
|
||||
let response = try await self.callTool(call)
|
||||
return await self.session.session.queue.addInputLine("""
|
||||
<tool_response>
|
||||
{"id": \(call.id), result: \(response)}
|
||||
</tool_response>
|
||||
""")
|
||||
} else {
|
||||
continuation.finish()
|
||||
}
|
||||
return
|
||||
case .insert(_, let element, _):
|
||||
delta.append(element)
|
||||
}
|
||||
}
|
||||
await streamProcessor.appendBuffer(delta)
|
||||
// Check if buffer contains a complete tool call
|
||||
if await streamProcessor.totalBuffer.starts(with: "\n\n\n{") {
|
||||
await streamProcessor.setIsProcessingToolCall(true)
|
||||
} else {
|
||||
await streamProcessor.setIsProcessingToolCall(false)
|
||||
// If not a tool call, yield the delta
|
||||
continuation.yield(delta)
|
||||
await streamProcessor.setBuffer("")
|
||||
}
|
||||
// Else, continue buffering until we have a complete tool call
|
||||
}
|
||||
}
|
||||
continuation.onTermination = { [observationToken] _ in
|
||||
observationToken?.invalidate()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public protocol LlamaActor: Actor {
|
||||
static func tools(_ self: Self) -> [String: (DynamicCallable, _JSONFunctionSchema)]
|
||||
var session: LlamaToolSession! { get }
|
||||
|
||||
}
|
||||
|
||||
public extension LlamaActor {
|
||||
func infer(_ message: String) async throws -> String {
|
||||
try await session.infer(message: message)
|
||||
}
|
||||
func inferenceStream(message: String) async -> AsyncStream<String> {
|
||||
await session.inferenceStream(message: message)
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: @llamaActor Macro
|
||||
/// A macro that transforms an actor into a Llama tool actor, automatically integrating with `LlamaToolSession`.
|
||||
///
|
||||
/// The `@llamaActor` macro simplifies the process of creating an actor with tools that can be used by the Llama language model.
|
||||
/// It automatically generates the necessary code to initialize a `LlamaToolSession`, register the tools, and conform to the `LlamaActor` protocol.
|
||||
///
|
||||
/// ### Usage:
|
||||
/// ```swift
|
||||
/// @llamaActor actor MyLlama {
|
||||
/// /// Gets the user's favorite season.
|
||||
/// @Tool public func getFavoriteSeason() async throws -> String {
|
||||
/// return "autumn"
|
||||
/// }
|
||||
///
|
||||
/// /// Gets the user's favorite animal.
|
||||
/// @Tool public func getFavoriteAnimal() async throws -> String {
|
||||
/// return "cat"
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// ### Macro Details:
|
||||
/// - **Attached To**: Actor declarations.
|
||||
/// - **Produces**:
|
||||
/// - Member variables and initializers required for tool integration.
|
||||
/// - An extension conforming the actor to `LlamaActor`, providing necessary functionalities.
|
||||
/// - **Parameters**: None.
|
||||
///
|
||||
/// ### Notes:
|
||||
/// - The macro processes functions marked with `@Tool` within the actor to generate dynamic callable tools.
|
||||
/// - It collects the tools and their schemas to register them with `LlamaToolSession`.
|
||||
///
|
||||
/// ### See Also:
|
||||
/// - `@Tool` Macro
|
||||
/// - `LlamaToolSession`
|
||||
@attached(member, names: arbitrary)
|
||||
@attached(extension, conformances: LlamaActor, names: arbitrary)
|
||||
public macro llamaActor() = #externalMacro(module: "LlamaKitMacros",
|
||||
type: "LlamaActorMacro")
|
||||
|
||||
/// A macro that marks a function within an actor as a tool callable by the Llama language model.
|
||||
///
|
||||
/// The `@Tool` macro indicates that a function should be exposed as a tool to the language model.
|
||||
/// It processes the function to generate a dynamically callable structure, registers it with `LlamaToolSession`,
|
||||
/// and includes the tool's metadata in the model's prompt.
|
||||
///
|
||||
/// ### Usage:
|
||||
/// ```swift
|
||||
/// @llamaActor actor MyLlama {
|
||||
/// /// Gets the user's favorite animal.
|
||||
/// @Tool public func getFavoriteAnimal() async throws -> String {
|
||||
/// return "cat"
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// ### Macro Details:
|
||||
/// - **Attached To**: Function declarations within an actor marked with `@llamaActor`.
|
||||
/// - **Produces**:
|
||||
/// - A dynamically callable structure that wraps the function.
|
||||
/// - Registers the tool with its name, description, and parameters.
|
||||
/// - **Parameters**: None.
|
||||
///
|
||||
/// ### Notes:
|
||||
/// - The function's documentation comment is used as the tool's description.
|
||||
/// - Parameter comments are used to describe the tool's parameters.
|
||||
/// - Supports functions with parameters and return values that conform to `Codable`.
|
||||
///
|
||||
/// ### See Also:
|
||||
/// - `@llamaActor` Macro
|
||||
/// - `LlamaToolSession`
|
||||
@attached(body)
|
||||
public macro Tool() = #externalMacro(module: "LlamaKitMacros",
|
||||
type: "ToolMacro")
|
|
@ -57,7 +57,7 @@ while true {
|
|||
break
|
||||
} else {
|
||||
print("🧔🏽♂️: \(userInput)")
|
||||
let response = try await llama.chat(userInput)
|
||||
let response = try await llama.infer(userInput)
|
||||
print("🤖: \(response)")
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -4,6 +4,18 @@ import Testing
|
|||
import JSONSchema
|
||||
import OSLog
|
||||
|
||||
@llamaActor actor MyLlama {
|
||||
/// Get the user's favorite season
|
||||
@Tool public func getFavoriteSeason() async throws -> String {
|
||||
return "autumn"
|
||||
}
|
||||
|
||||
/// Get the user's favorite animal.
|
||||
@Tool public func getFavoriteAnimal() async throws -> String {
|
||||
return "cat"
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: LlamaGrammarSession Suite
|
||||
@Suite("LlamaSession Suite")
|
||||
struct LlamaSessionSuite {
|
||||
|
@ -43,7 +55,7 @@ struct LlamaSessionSuite {
|
|||
func baseParams(url: String, to path: String) async throws -> GPTParams {
|
||||
let params = GPTParams()
|
||||
params.modelPath = try await downloadFile(url: url, to: path)
|
||||
params.nPredict = 512
|
||||
params.nPredict = 4096
|
||||
params.nCtx = 4096
|
||||
params.cpuParams.nThreads = 8
|
||||
params.cpuParamsBatch.nThreads = 8
|
||||
|
@ -89,7 +101,7 @@ struct LlamaSessionSuite {
|
|||
"""
|
||||
let session = try await LlamaSession<Trip>(params: params)
|
||||
await #expect(throws: Never.self) {
|
||||
let trip = try await session.chat(message: "Please create a trip for me to New York City that starts two weeks from now. The duration of the trip MUST be 3 days long.")
|
||||
let trip = try await session.infer(message: "Please create a trip for me to New York City that starts two weeks from now. The duration of the trip MUST be 3 days long.")
|
||||
#expect(trip.location.contains("New York"))
|
||||
// TODO: Testing the other fields is difficult considering model size
|
||||
// TODO: so for now, we are just asserting the grammar works
|
||||
|
@ -100,83 +112,93 @@ struct LlamaSessionSuite {
|
|||
let isSpellingCorrect: Bool
|
||||
}
|
||||
|
||||
// MARK: Grammar Test
|
||||
@Test func llamaSimpleGrammarSession() async throws {
|
||||
let params = try await baseParams(url: "https://huggingface.co/RichardErkhov/openfoodfacts_-_spellcheck-mistral-7b-gguf/resolve/main/spellcheck-mistral-7b.Q8_0.gguf?download=true",
|
||||
to: "spellcheck_q8.gguf")
|
||||
let params = try await baseParams(url: "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q5_K_S.gguf?download=true",
|
||||
to: "spellcheck_mistral.gguf")
|
||||
params.prompt = """
|
||||
###You are a spell checker. I will provide you with the word 'strawberry'. If the spelling of the given word is correct, please respond {"isCorrect": true} else respond {"isCorrect": false}.\n
|
||||
You are a spell checker. I will provide you with versions of the word 'strawberry'. Tell me if the spelling is correct or not.
|
||||
"""
|
||||
params.inputPrefix = "<s>[INST]"
|
||||
params.inputSuffix = "[/INST]"
|
||||
params.antiPrompts.append("</s>")
|
||||
let session = try await LlamaSession<IsCorrect>(params: params)
|
||||
for _ in 0..<10 {
|
||||
var output = try await session.chat(message: "###strawberry\n")
|
||||
#expect(output.isSpellingCorrect)
|
||||
output = try await session.chat(message: "###strawberrry\n")
|
||||
#expect(!output.isSpellingCorrect)
|
||||
var output = try await session.infer(message: "strawberry\n")
|
||||
#expect(output.isSpellingCorrect)
|
||||
output = try await session.infer(message: "st44rawberrry\n")
|
||||
#expect(!output.isSpellingCorrect)
|
||||
}
|
||||
|
||||
// MARK: Tool Test
|
||||
@Test func llamaToolSession() async throws {
|
||||
let params = try await baseParams(url: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q8_0.gguf?download=true", to: "llama_tools.gguf")
|
||||
let fm = FileManager.default
|
||||
let tmpDir = fm.temporaryDirectory
|
||||
let cacheURL = tmpDir.appending(path: "cache")
|
||||
params.pathPromptCache = cacheURL.path()
|
||||
params.logging = true
|
||||
defer { try? fm.removeItem(at: cacheURL) }
|
||||
let llama = try await MyLlama(params: params)
|
||||
var output = try await llama.infer("What's my favorite animal?")
|
||||
#expect(output.contains("cat"))
|
||||
output = try await llama.infer("What's my favorite season?")
|
||||
#expect(output.contains("autumn"))
|
||||
}
|
||||
|
||||
// MARK: Session dealloc Test
|
||||
@Test func llamaToolSessionDealloc() async throws {
|
||||
let params = try await baseParams(url: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q8_0.gguf?download=true", to: "llama_tools.gguf")
|
||||
func reportMemoryUsage() -> UInt64? {
|
||||
var info = mach_task_basic_info()
|
||||
var count = mach_msg_type_number_t(MemoryLayout.size(ofValue: info)) / 4
|
||||
|
||||
let kerr = withUnsafeMutablePointer(to: &info) {
|
||||
$0.withMemoryRebound(to: integer_t.self, capacity: 1) {
|
||||
task_info(mach_task_self_, task_flavor_t(MACH_TASK_BASIC_INFO), $0, &count)
|
||||
}
|
||||
}
|
||||
|
||||
guard kerr == KERN_SUCCESS else {
|
||||
print("Error with task_info(): \(kerr)")
|
||||
return nil
|
||||
}
|
||||
|
||||
return info.resident_size // Memory in bytes
|
||||
}
|
||||
var memPostAlloc: UInt64!
|
||||
try await Task {
|
||||
let llama = try await MyLlama(params: params)
|
||||
memPostAlloc = reportMemoryUsage()! / 1024 / 1024
|
||||
#expect(memPostAlloc > 500) // we are estimating here
|
||||
var output = try await llama.infer("What's my favorite animal?")
|
||||
print(output)
|
||||
output = try await llama.infer("What question did i just ask you?")
|
||||
print(output)
|
||||
}.value
|
||||
sleep(1)
|
||||
var memDealloc = reportMemoryUsage()! / 1024 / 1024
|
||||
#expect(memDealloc < 200)
|
||||
try await Task {
|
||||
let llama = try await MyLlama(params: params)
|
||||
memPostAlloc = reportMemoryUsage()! / 1024 / 1024
|
||||
#expect(memPostAlloc > 500)
|
||||
_ = try await llama.infer("What was the first question I asked you?")
|
||||
}.value
|
||||
sleep(1)
|
||||
memDealloc = reportMemoryUsage()! / 1024 / 1024
|
||||
#expect(memDealloc < 200)
|
||||
}
|
||||
|
||||
// MARK: Stream Test
|
||||
@Test func llamaToolSessionStream() async throws {
|
||||
let params = try await baseParams(url: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q8_0.gguf?download=true", to: "llama_tools.gguf")
|
||||
params.prompt = "You are an AI tool assistant that can chain tool calls together."
|
||||
let llama = try await MyLlama(params: params)
|
||||
var buffer = ""
|
||||
for await output in await llama.inferenceStream(message: "What's my favorite animal and season?") {
|
||||
buffer += output
|
||||
print(output, terminator: "")
|
||||
}
|
||||
#expect(buffer.contains("cat") && buffer.contains("autumn"))
|
||||
}
|
||||
}
|
||||
|
||||
import WeatherKit
|
||||
import CoreLocation
|
||||
|
||||
func downloadFile() async throws -> String {
|
||||
let fm = FileManager.default
|
||||
let tmpDir = fm.temporaryDirectory
|
||||
let destinationURL = tmpDir.appending(path: "llama_groq_gguf.gguf")
|
||||
|
||||
guard !fm.fileExists(atPath: destinationURL.path()) else {
|
||||
return destinationURL.path()
|
||||
}
|
||||
print("Downloading Llama Tools, this may take a while...")
|
||||
// Define the URL
|
||||
guard let url = URL(string: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf?download=true") else {
|
||||
print("Invalid URL.")
|
||||
throw URLError(.badURL)
|
||||
}
|
||||
|
||||
// Start the async download
|
||||
let (tempURL, _) = try await URLSession.shared.download(from: url)
|
||||
|
||||
// Define the destination path in the documents directory
|
||||
|
||||
|
||||
// Move the downloaded file to the destination
|
||||
try fm.moveItem(at: tempURL, to: destinationURL)
|
||||
print("File downloaded to: \(destinationURL.path())")
|
||||
return destinationURL.path()
|
||||
}
|
||||
|
||||
|
||||
@llamaActor actor MyLlama {
|
||||
struct CurrentWeather: Codable {
|
||||
let temperature: Double
|
||||
let condition: WeatherCondition
|
||||
}
|
||||
|
||||
/// Get the current weather in a given location.
|
||||
/// - parameter location: The city and state, e.g. San Francisco, CA
|
||||
/// - parameter unit: The unit of temperature
|
||||
@Tool public func getCurrentWeather(location: String, unit: String) async throws -> CurrentWeather {
|
||||
let weather = try await WeatherService().weather(for: CLGeocoder().geocodeAddressString(location)[0].location!)
|
||||
var temperature = weather.currentWeather.temperature
|
||||
temperature.convert(to: .fahrenheit)
|
||||
return CurrentWeather(temperature: temperature.value,
|
||||
condition: weather.currentWeather.condition)
|
||||
}
|
||||
}
|
||||
|
||||
@Test func llamaToolSession() async throws {
|
||||
let params = GPTParams()
|
||||
params.modelPath = try await downloadFile()
|
||||
params.nPredict = 512
|
||||
params.nCtx = 4096
|
||||
params.cpuParams.nThreads = 8
|
||||
params.cpuParamsBatch.nThreads = 8
|
||||
params.nBatch = 1024
|
||||
params.nGpuLayers = 1024
|
||||
let llama = try await MyLlama(params: params)
|
||||
let currentWeather = try await llama.getCurrentWeather(location: "San Francisco, CA", unit: "farenheit")
|
||||
let output = try await llama.chat("What's the weather (in farenheit) in San Francisco, CA?")
|
||||
#expect(output.contains(String(format: "%d", Int(currentWeather.temperature))))
|
||||
// #expect(output.contains(currentWeather.condition.rawValue))
|
||||
}
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
|
||||
#include <ggml.h>
|
||||
#include <ggml-cpu.h>
|
||||
#include <ggml-alloc.h>
|
||||
#include <ggml-backend.h>
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <chrono>
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// Unit tests for quantization specific functions - quantize, dequantize and dot product
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
|
@ -78,18 +79,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
|||
|
||||
// Total dot product error
|
||||
static float dot_product_error(
|
||||
const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
|
||||
const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
|
||||
) {
|
||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||
|
||||
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
|
||||
const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
|
||||
|
||||
qfns->from_float(test_data1, tmp_q1.data(), test_size);
|
||||
vdot->from_float(test_data2, tmp_q2.data(), test_size);
|
||||
|
||||
float result = INFINITY;
|
||||
qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||
|
||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||
|
||||
|
@ -132,6 +133,7 @@ int main(int argc, char * argv[]) {
|
|||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
||||
|
||||
// deprecated - skip
|
||||
if (qfns->blck_size == 0) {
|
||||
|
@ -166,7 +168,7 @@ int main(int argc, char * argv[]) {
|
|||
printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
|
||||
}
|
||||
|
||||
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
||||
const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
|
||||
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
||||
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// Benchmark quantization specific functions on synthetic data
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#undef NDEBUG
|
||||
#include <algorithm>
|
||||
|
@ -271,6 +272,7 @@ int main(int argc, char * argv[]) {
|
|||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
||||
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
|
@ -328,7 +330,7 @@ int main(int argc, char * argv[]) {
|
|||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
|
||||
const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
|
||||
vdot->from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
|
@ -346,7 +348,7 @@ int main(int argc, char * argv[]) {
|
|||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
float result;
|
||||
qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||
return result;
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue