Merge branch 'ggerganov:master' into gguf-model-template

This commit is contained in:
Austin 2024-07-26 00:06:35 -04:00 committed by GitHub
commit 43eef8d287
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
91 changed files with 5818 additions and 8884 deletions

View file

@ -14,7 +14,9 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \ echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \ fi && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ echo "Building with static libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

View file

@ -14,6 +14,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \ echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \ fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server cmake --build build --config Release --target llama-server

View file

@ -10,7 +10,6 @@
"llama-embedding" "llama-embedding"
"llama-server" "llama-server"
"llama-quantize" "llama-quantize"
"llama-train-text-from-scratch"
]; ];
mkApp = name: { mkApp = name: {
type = "app"; type = "app";

View file

@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@" ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@" ./llama-cli "$@"
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
./llama-finetune "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..." echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -36,8 +34,6 @@ else
echo " ex: --outtype f16 \"/models/7B/\" " echo " ex: --outtype f16 \"/models/7B/\" "
echo " --quantize (-q): Optimize with quantization process ggml" echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
echo " See documentation for finetune for command-line parameters"
echo " --all-in-one (-a): Execute --convert & --quantize" echo " --all-in-one (-a): Execute --convert & --quantize"
echo " ex: \"/models/\" 7B" echo " ex: \"/models/\" 7B"
echo " --server (-s): Run a model on the server" echo " --server (-s): Run a model on the server"

View file

@ -860,7 +860,7 @@ jobs:
mkdir build mkdir build
cd build cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
- name: Determine tag name - name: Determine tag name
id: tag id: tag

View file

@ -1,12 +1,17 @@
# Pull requests # Pull requests (for contributors)
- Always squash-merge the PR before merging
- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Test your changes: - Test your changes:
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
- Execute [the full CI locally on your machine](ci/README.md) before publishing - Execute [the full CI locally on your machine](ci/README.md) before publishing
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
# Pull requests (for collaborators)
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
# Coding guidelines # Coding guidelines

View file

@ -11,7 +11,6 @@ BUILD_TARGETS = \
llama-embedding \ llama-embedding \
llama-eval-callback \ llama-eval-callback \
llama-export-lora \ llama-export-lora \
llama-finetune \
llama-gbnf-validator \ llama-gbnf-validator \
llama-gguf \ llama-gguf \
llama-gguf-hash \ llama-gguf-hash \
@ -37,7 +36,6 @@ BUILD_TARGETS = \
llama-simple \ llama-simple \
llama-speculative \ llama-speculative \
llama-tokenize \ llama-tokenize \
llama-train-text-from-scratch \
llama-vdot \ llama-vdot \
llama-cvector-generator \ llama-cvector-generator \
tests/test-c.o tests/test-c.o
@ -64,13 +62,13 @@ TEST_TARGETS = \
tests/test-tokenizer-1-spm tests/test-tokenizer-1-spm
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries. # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
# Deprecation aliases # Deprecation aliases
ifdef LLAMA_CUBLAS ifdef LLAMA_CUBLAS
@ -876,6 +874,9 @@ OBJ_GGML += \
OBJ_LLAMA = \ OBJ_LLAMA = \
src/llama.o \ src/llama.o \
src/llama-vocab.o \
src/llama-grammar.o \
src/llama-sampling.o \
src/unicode.o \ src/unicode.o \
src/unicode-data.o src/unicode-data.o
@ -1055,6 +1056,10 @@ src/unicode-data.o: \
src/llama.o: \ src/llama.o: \
src/llama.cpp \ src/llama.cpp \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-grammar.h \
src/llama-sampling.h \
src/unicode.h \ src/unicode.h \
include/llama.h \ include/llama.h \
ggml/include/ggml-cuda.h \ ggml/include/ggml-cuda.h \
@ -1064,6 +1069,29 @@ src/llama.o: \
ggml/include/ggml-backend.h ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-vocab.o: \
src/llama-vocab.cpp \
src/llama-vocab.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-grammar.o: \
src/llama-grammar.cpp \
src/llama-grammar.h \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-sampling.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-sampling.o: \
src/llama-sampling.cpp \
src/llama-sampling.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(LIB_LLAMA): \ $(LIB_LLAMA): \
$(OBJ_LLAMA) \ $(OBJ_LLAMA) \
$(LIB_GGML) $(LIB_GGML)
@ -1266,11 +1294,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
$(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_GGML) $(OBJ_LLAMA)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@ -1286,13 +1309,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-finetune: examples/finetune/finetune.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-export-lora: examples/export-lora/export-lora.cpp \ llama-export-lora: examples/export-lora/export-lora.cpp \
$(OBJ_GGML) common/log.h $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1439,7 +1457,7 @@ run-benchmark-matmult: llama-benchmark-matmult
.PHONY: run-benchmark-matmult swift .PHONY: run-benchmark-matmult swift
tests/test-llama-grammar: tests/test-llama-grammar.cpp \ tests/test-llama-grammar: tests/test-llama-grammar.cpp \
$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1548,7 +1566,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed. # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
# #
# Mark legacy binary targets as .PHONY so that they are always checked. # Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server finetune .PHONY: main quantize perplexity embedding server
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate. # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
# Eventually we will want to remove these target from building all the time. # Eventually we will want to remove these target from building all the time.
@ -1591,13 +1609,3 @@ ifneq (,$(wildcard embedding))
@echo " Remove the 'embedding' binary to remove this warning." @echo " Remove the 'embedding' binary to remove this warning."
@echo "#########" @echo "#########"
endif endif
finetune: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard finetune))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
@echo " Remove the 'finetune' binary to remove this warning."
@echo "#########"
endif

View file

@ -4,6 +4,9 @@ import PackageDescription
var sources = [ var sources = [
"src/llama.cpp", "src/llama.cpp",
"src/llama-vocab.cpp",
"src/llama-grammar.cpp",
"src/llama-sampling.cpp",
"src/unicode.cpp", "src/unicode.cpp",
"src/unicode-data.cpp", "src/unicode-data.cpp",
"ggml/src/ggml.c", "ggml/src/ggml.c",

View file

@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well.
Unless otherwise noted these projects are open-source with permissive licensing: Unless otherwise noted these projects are open-source with permissive licensing:
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) - [iohub/collama](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL) - [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground) - [nat/openplayground](https://github.com/nat/openplayground)
@ -181,6 +182,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
## Demo ## Demo
<details> <details>

View file

@ -694,11 +694,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
return true; return true;
} }
if (arg == "--lora-base") {
CHECK_ARG
params.lora_base = argv[i];
return true;
}
if (arg == "--control-vector") { if (arg == "--control-vector") {
CHECK_ARG CHECK_ARG
params.control_vectors.push_back({ 1.0f, argv[i], }); params.control_vectors.push_back({ 1.0f, argv[i], });
@ -1274,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
CHECK_ARG CHECK_ARG
params.out_file = argv[i]; params.out_file = argv[i];
params.cvector_outfile = argv[i]; params.cvector_outfile = argv[i];
params.lora_outfile = argv[i];
return true; return true;
} }
if (arg == "-ofreq" || arg == "--output-frequency") { if (arg == "-ofreq" || arg == "--output-frequency") {
@ -1583,9 +1579,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
"advanced option to override model metadata by key. may be specified multiple times.\n" "advanced option to override model metadata by key. may be specified multiple times.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
"note: this argument can be repeated to add multiple control vectors" }); "note: this argument can be repeated to add multiple control vectors" });
options.push_back({ "*", " --control-vector-scaled FNAME SCALE", options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
@ -1676,6 +1671,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
options.push_back({ "export-lora" });
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]); printf("usage: %s [options]\n", argv[0]);
for (const auto & o : options) { for (const auto & o : options) {
@ -2721,7 +2723,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
const llama_chat_msg & new_msg, const llama_chat_msg & new_msg,
bool add_ass) { bool add_ass) {
std::ostringstream ss; std::ostringstream ss;
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg); std::vector<llama_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version // if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@ -3166,7 +3168,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
} }
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
} }
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

View file

@ -128,7 +128,6 @@ struct gpt_params {
// TODO: avoid tuple, use struct // TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
@ -255,6 +254,8 @@ struct gpt_params {
std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
}; };
void gpt_params_handle_hf_token(gpt_params & params); void gpt_params_handle_hf_token(gpt_params & params);

View file

@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl(
llama_token_data_array single_token_data_array = { &single_token_data, 1, false }; llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token // Apply grammar constraints to the single token
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar); llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY; bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
// apply grammar checks before sampling logic // apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) { if (apply_grammar && ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
} }
return cur_p; return cur_p;
@ -455,6 +455,6 @@ void llama_sampling_accept(
ctx_sampling->prev.push_back(id); ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL && apply_grammar) { if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id); llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
} }
} }

View file

@ -48,7 +48,7 @@ class Model:
dir_model: Path dir_model: Path
ftype: gguf.LlamaFileType ftype: gguf.LlamaFileType
fname_out: Path | None fname_out: Path
is_big_endian: bool is_big_endian: bool
endianess: gguf.GGUFEndian endianess: gguf.GGUFEndian
use_temp_file: bool use_temp_file: bool
@ -62,11 +62,12 @@ class Model:
gguf_writer: gguf.GGUFWriter gguf_writer: gguf.GGUFWriter
model_name: str | None model_name: str | None
metadata_override: Path | None metadata_override: Path | None
dir_model_card: Path
# subclasses should define this! # subclasses should define this!
model_arch: gguf.MODEL_ARCH model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path | None, is_big_endian: bool = False, def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False, use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None, metadata_override: Path | None = None, model_name: str | None = None,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
@ -90,6 +91,7 @@ class Model:
self.tensor_names = None self.tensor_names = None
self.metadata_override = metadata_override self.metadata_override = metadata_override
self.model_name = model_name self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
if self.ftype == gguf.LlamaFileType.GUESSED: if self.ftype == gguf.LlamaFileType.GUESSED:
@ -237,6 +239,10 @@ class Model:
self.gguf_writer.add_expert_used_count(n_experts_used) self.gguf_writer.add_expert_used_count(n_experts_used)
logger.info(f"gguf: experts used count = {n_experts_used}") logger.info(f"gguf: experts used count = {n_experts_used}")
if (head_dim := self.hparams.get("head_dim")) is not None:
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
logger.info(f"gguf: file type = {self.ftype}") logger.info(f"gguf: file type = {self.ftype}")
@ -345,7 +351,7 @@ class Model:
total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model, self.model_name, total_params) self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
# Fallback to model directory name if metadata name is still missing # Fallback to model directory name if metadata name is still missing
if self.metadata.name is None: if self.metadata.name is None:
@ -359,27 +365,22 @@ class Model:
output_type: str = self.ftype.name.partition("_")[2] output_type: str = self.ftype.name.partition("_")[2]
# Filename Output # Filename Output
# Note: `not is_dir()` is used because `.is_file()` will not detect if self.fname_out.is_dir():
# file template strings as it doesn't actually exist as a file
if self.fname_out is not None and not self.fname_out.is_dir():
# Output path is a custom defined templated filename
# Process templated file name with the output ftype, useful with the "auto" ftype
self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
else:
# Generate default filename based on model specification and available metadata # Generate default filename based on model specification and available metadata
if not vocab_only: if not vocab_only:
fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
else: else:
fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
# Check if preferred output directory path was provided # Use the default filename
if self.fname_out is not None and self.fname_out.is_dir():
# output path is a directory
self.fname_out = self.fname_out / f"{fname_default}.gguf" self.fname_out = self.fname_out / f"{fname_default}.gguf"
else: else:
# output in the same directory as the model by default # Output path is a custom defined templated filename
self.fname_out = self.dir_model / f"{fname_default}.gguf" # Note: `not is_dir()` is used because `.is_file()` will not detect
# file template strings as it doesn't actually exist as a file
# Process templated file name with the output ftype, useful with the "auto" ftype
self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
self.set_type() self.set_type()
@ -593,6 +594,15 @@ class Model:
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
# ref: https://huggingface.co/core42/jais-13b # ref: https://huggingface.co/core42/jais-13b
res = "jais" res = "jais"
if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
# ref: https://huggingface.co/WisdomShell/CodeShell-7B
res = "codeshell"
if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
# ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
res = "tekken"
if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
# ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
res = "smollm"
if res is None: if res is None:
logger.warning("\n") logger.warning("\n")
@ -733,7 +743,7 @@ class Model:
added_tokens_json = json.load(f) added_tokens_json = json.load(f)
for key in added_tokens_json: for key in added_tokens_json:
token_id = added_tokens_json[key] token_id = added_tokens_json[key]
if (token_id >= vocab_size): if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue continue
@ -750,7 +760,8 @@ class Model:
token_id = int(token_id) token_id = int(token_id)
token: str = token_data["content"] token: str = token_data["content"]
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert tokens[token_id] == token.encode("utf-8") if tokens[token_id] != token.encode("utf-8"):
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
if token_data.get("special") or self.does_token_look_special(token): if token_data.get("special") or self.does_token_look_special(token):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL toktypes[token_id] = SentencePieceTokenTypes.CONTROL
else: else:
@ -1309,6 +1320,7 @@ class RefactModel(Model):
special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("prefix", 1)
special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("suffix", 3)
special_vocab._set_special_token("middle", 2) special_vocab._set_special_token("middle", 2)
special_vocab.chat_template = None # do not add it twice
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
@ -1479,7 +1491,12 @@ class LlamaModel(Model):
super().set_gguf_parameters() super().set_gguf_parameters()
hparams = self.hparams hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
if "head_dim" in hparams:
rope_dim = hparams["head_dim"]
else:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear": if self.hparams["rope_scaling"].get("type") == "linear":
@ -1994,7 +2011,7 @@ class Phi3MiniModel(Model):
for key in added_tokens_json: for key in added_tokens_json:
token_id = added_tokens_json[key] token_id = added_tokens_json[key]
if (token_id >= vocab_size): if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue continue
@ -2011,7 +2028,8 @@ class Phi3MiniModel(Model):
token_id = int(token_id) token_id = int(token_id)
token = foken_data["content"].encode("utf-8") token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert tokens[token_id] == token if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token tokens[token_id] = token
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@ -2027,7 +2045,8 @@ class Phi3MiniModel(Model):
token_id = int(foken_data["id"]) token_id = int(foken_data["id"])
token = foken_data["content"].encode("utf-8") token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert tokens[token_id] == token if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token tokens[token_id] = token
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@ -2065,10 +2084,11 @@ class Phi3MiniModel(Model):
self.gguf_writer.add_rope_dimension_count(rope_dims) self.gguf_writer.add_rope_dimension_count(rope_dims)
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
# write rope scaling for long context (128k) model # write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True) rope_scaling = self.find_hparam(['rope_scaling'], True)
if (rope_scaling is None): if rope_scaling is None:
return return
scale = max_pos_embds / orig_max_pos_embds scale = max_pos_embds / orig_max_pos_embds
@ -2266,7 +2286,8 @@ class InternLM2Model(Model):
chat_eos_token_id = token_id chat_eos_token_id = token_id
token = token.encode("utf-8") token = token.encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert(tokens[token_id] == token) if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token tokens[token_id] = token
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@ -2285,7 +2306,8 @@ class InternLM2Model(Model):
chat_eos_token_id = token_id chat_eos_token_id = token_id
token = token.encode("utf-8") token = token.encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert(tokens[token_id] == token) if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token tokens[token_id] = token
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@ -2471,6 +2493,7 @@ class GemmaModel(Model):
special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("middle", 68)
special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("fsep", 70)
special_vocab._set_special_token("eot", 107) special_vocab._set_special_token("eot", 107)
special_vocab.chat_template = None # do not add it twice
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False) self.gguf_writer.add_add_space_prefix(False)
@ -2712,7 +2735,7 @@ class JinaBertV2Model(BertModel):
yield name, data yield name, data
def set_vocab(self, *args, **kwargs): def set_vocab(self):
tokenizer_class = 'BertTokenizer' tokenizer_class = 'BertTokenizer'
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_class = json.load(f)['tokenizer_class'] tokenizer_class = json.load(f)['tokenizer_class']
@ -2860,7 +2883,7 @@ class ArcticModel(Model):
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
for token_id, token_json in added_tokens_decoder.items(): for token_id, token_json in added_tokens_decoder.items():
token_id = int(token_id) token_id = int(token_id)
if (token_id >= vocab_size): if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue continue
@ -3109,7 +3132,7 @@ class T5Model(Model):
added_tokens_json = json.load(f) added_tokens_json = json.load(f)
for key in added_tokens_json: for key in added_tokens_json:
token_id = added_tokens_json[key] token_id = added_tokens_json[key]
if (token_id >= vocab_size): if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue continue
@ -3624,10 +3647,10 @@ def main() -> None:
logger.error("Error: Cannot use temp file when splitting") logger.error("Error: Cannot use temp file when splitting")
sys.exit(1) sys.exit(1)
fname_out = None
if args.outfile is not None: if args.outfile is not None:
fname_out = args.outfile fname_out = args.outfile
else:
fname_out = dir_model
logger.info(f"Loading model: {dir_model.name}") logger.info(f"Loading model: {dir_model.name}")
@ -3658,7 +3681,6 @@ def main() -> None:
else: else:
logger.info("Exporting model...") logger.info("Exporting model...")
model_instance.write() model_instance.write()
assert model_instance.fname_out is not None
out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
logger.info(f"Model successfully exported to {out_path}") logger.info(f"Model successfully exported to {out_path}")

View file

@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
# TODO: this string has to exercise as much pre-tokenizer functionality as possible # TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome # will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2: if len(sys.argv) == 2:
token = sys.argv[1] token = sys.argv[1]
@ -91,6 +91,9 @@ models = [
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
] ]
@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
response = sess.get(url, headers=headers) response = sess.get(url, headers=headers)
response.raise_for_status() response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True) os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f: with open(save_path, 'wb') as downloaded_file:
f.write(response.content) downloaded_file.write(response.content)
logger.info(f"File {save_path} downloaded successfully") logger.info(f"File {save_path} downloaded successfully")
@ -159,7 +162,7 @@ for model in models:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(chktxt) chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest() chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}") logger.info(f"model: {name}")
@ -191,7 +194,7 @@ src_func = f"""
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer # use in llama.cpp to implement the same pre-tokenizer
chktxt = {repr(chktxt)} chktxt = {repr(CHK_TXT)}
chktok = tokenizer.encode(chktxt) chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest() chkhsh = sha256(str(chktok).encode()).hexdigest()
@ -287,7 +290,7 @@ tests = [
"333333333", "333333333",
"Cửa Việt", # llama-bpe fails on this "Cửa Việt", # llama-bpe fails on this
" discards", " discards",
chktxt, CHK_TXT,
] ]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp # write the tests to ./models/ggml-vocab-{name}.gguf.inp

View file

@ -132,6 +132,10 @@ class Tensor:
class GGMLModel: class GGMLModel:
file_format: GGMLFormat
format_version: int
def __init__(self): def __init__(self):
self.hyperparameters = None self.hyperparameters = None
self.vocab = None self.vocab = None
@ -290,7 +294,7 @@ class GGMLToGGUF:
if self.vocab_override is not None: if self.vocab_override is not None:
vo = self.vocab_override vo = self.vocab_override
logger.info('* Adding vocab item(s)') logger.info('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes) tokens.append(vbytes)
scores.append(score) scores.append(score)
toktypes.append(ttype) toktypes.append(ttype)

View file

@ -290,7 +290,7 @@ if __name__ == '__main__':
fname_out = args.outfile fname_out = args.outfile
else: else:
# output in the same directory as the model by default # output in the same directory as the model by default
fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' fname_out = dir_lora
if os.path.exists(input_model): if os.path.exists(input_model):
# lazy import load_file only if lora is in safetensors format. # lazy import load_file only if lora is in safetensors format.
@ -304,12 +304,6 @@ if __name__ == '__main__':
# load base model # load base model
logger.info(f"Loading base model: {dir_base_model.name}") logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model) hparams = Model.load_hparams(dir_base_model)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
with torch.inference_mode(): with torch.inference_mode():
try: try:
model_class = Model.from_model_architecture(hparams["architectures"][0]) model_class = Model.from_model_architecture(hparams["architectures"][0])
@ -320,12 +314,21 @@ if __name__ == '__main__':
class LoraModel(model_class): class LoraModel(model_class):
model_arch = model_class.model_arch model_arch = model_class.model_arch
lora_alpha: float
def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
super().__init__(*args, **kwargs)
self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha)
def set_type(self): def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha)) self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
super().set_gguf_parameters() super().set_gguf_parameters()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
@ -368,6 +371,11 @@ if __name__ == '__main__':
yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b) yield (dest_name + ".lora_b", lora_b)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
model_instance = LoraModel( model_instance = LoraModel(
dir_base_model, dir_base_model,
ftype, ftype,
@ -376,6 +384,8 @@ if __name__ == '__main__':
use_temp_file=False, use_temp_file=False,
eager=args.no_lazy, eager=args.no_lazy,
dry_run=args.dry_run, dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
) )
logger.info("Exporting model...") logger.info("Exporting model...")

View file

@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
```sh ```sh
./build/bin/llama-ls-sycl-device ./build/bin/llama-ls-sycl-device
``` ```
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
``` ```
found 6 SYCL devices: found 2 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| | | | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size| |ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
``` ```
| Attribute | Note |
|------------------------|-------------------------------------------------------------|
| compute capability 1.3 | Level-zero driver/runtime, recommended |
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
4. Launch inference 4. Launch inference
There are two device selection modes: There are two device selection modes:
- Single device: Use one device target specified by the user. - Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units. - Multiple devices: Automatically choose the devices with the same backend.
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
| Device selection | Parameter | | Device selection | Parameter |
|------------------|----------------------------------------| |------------------|----------------------------------------|
@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
build\bin\ls-sycl-device.exe build\bin\ls-sycl-device.exe
``` ```
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following: This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
``` ```
found 6 SYCL devices: found 2 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| | | | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size| |ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
``` ```
| Attribute | Note |
|------------------------|-----------------------------------------------------------|
| compute capability 1.3 | Level-zero running time, recommended |
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
4. Launch inference 4. Launch inference
There are two device selection modes: There are two device selection modes:
- Single device: Use one device assigned by user. - Single device: Use one device assigned by user. Default device id is 0.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units. - Multiple devices: Automatically choose the devices with the same backend.
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
| Device selection | Parameter | | Device selection | Parameter |
|------------------|----------------------------------------| |------------------|----------------------------------------|

View file

@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
make make
``` ```
- On Windows: - On Windows (x86/x64 only, arm64 requires cmake):
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc. 2. Extract `w64devkit` on your pc.
@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options.
cmake -B build -G "Xcode" cmake -B build -G "Xcode"
cmake --build build --config Debug cmake --build build --config Debug
``` ```
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
- Tab Workload: Desktop-development with C++
- Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
- Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
- For Windows on ARM (arm64, WoA) build with:
```bash
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
- Using `gmake` (FreeBSD): - Using `gmake` (FreeBSD):

View file

@ -21,7 +21,6 @@ else()
add_subdirectory(embedding) add_subdirectory(embedding)
add_subdirectory(eval-callback) add_subdirectory(eval-callback)
add_subdirectory(export-lora) add_subdirectory(export-lora)
add_subdirectory(finetune)
add_subdirectory(gbnf-validator) add_subdirectory(gbnf-validator)
add_subdirectory(gguf-hash) add_subdirectory(gguf-hash)
add_subdirectory(gguf-split) add_subdirectory(gguf-split)
@ -53,5 +52,4 @@ else()
add_subdirectory(simple) add_subdirectory(simple)
add_subdirectory(speculative) add_subdirectory(speculative)
add_subdirectory(tokenize) add_subdirectory(tokenize)
add_subdirectory(train-text-from-scratch)
endif() endif()

View file

@ -13,7 +13,6 @@ Please update all scripts and workflows to use the new binary names.
| server | llama-server | | server | llama-server |
| llama-bench | llama-bench | | llama-bench | llama-bench |
| embedding | llama-embedding | | embedding | llama-embedding |
| finetune | llama-finetune |
| quantize | llama-quantize | | quantize | llama-quantize |
| tokenize | llama-tokenize | | tokenize | llama-tokenize |
| export-lora | llama-export-lora | | export-lora | llama-export-lora |
@ -45,7 +44,6 @@ Please update all scripts and workflows to use the new binary names.
| save-load-state | llama-save-load-state | | save-load-state | llama-save-load-state |
| simple | llama-simple | | simple | llama-simple |
| speculative | llama-speculative | | speculative | llama-speculative |
| train-text-from-scratch | llama-train-text-from-scratch |
| vdot | llama-vdot | | vdot | llama-vdot |
| tests/test-c.o | tests/test-c.o | | tests/test-c.o | tests/test-c.o |

View file

@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model.
usage: llama-export-lora [options] usage: llama-export-lora [options]
options: options:
-h, --help show this help message and exit -m, --model model path from which to load base model (default '')
-m FNAME, --model-base FNAME model path from which to load base model (default '') --lora FNAME path to LoRA adapter (can be repeated to use multiple adapters)
-o FNAME, --model-out FNAME path to save exported model (default '') --lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)
-l FNAME, --lora FNAME apply LoRA adapter -t, --threads N number of threads to use during computation (default: 4)
-s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S -o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf')
-t N, --threads N number of threads to use during computation (default: 4)
``` ```
For example: For example:
@ -20,7 +19,15 @@ For example:
./bin/llama-export-lora \ ./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \ -m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
``` ```
Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters. Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
```bash
./bin/llama-export-lora \
-m your_base_model.gguf \
-o your_merged_model.gguf \
--lora-scaled lora_task_A.gguf 0.5 \
--lora-scaled lora_task_B.gguf 0.5
```

View file

@ -1,465 +1,420 @@
#include "common.h" #include "common.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include <map>
#include <vector> #include <vector>
#include <string> #include <string>
#include <thread> #include <thread>
#include <fstream>
struct lora_info { static bool g_verbose = false;
std::string filename;
static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
}
static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
}
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
file.write(&zero, 1);
}
}
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ ctx_ggml,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
throw std::runtime_error("failed to load input GGUF from " + fname);
}
return ctx_gguf;
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
struct file_input {
struct ggml_context * ctx_meta = nullptr;
struct gguf_context * ctx_gguf = nullptr;
std::ifstream f_in;
std::map<std::string, ggml_tensor *> tensors;
float alpha;
float scale; float scale;
file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
if (!f_in.is_open()) {
throw std::runtime_error("failed to open input gguf from " + fname);
}
ctx_gguf = load_gguf(fname, &ctx_meta);
alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
std::string name(cur->name);
tensors[name] = cur;
if (g_verbose) {
printf("%s: %s\n", __func__, cur->name);
}
}
}
ggml_tensor * get_tensor(std::string name) {
if (tensors.find(name) == tensors.end()) {
return nullptr;
}
return tensors[name];
}
void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
if (tensors.find(name) == tensors.end()) {
throw std::runtime_error("cannot find tensor with name: " + name);
}
auto len = ggml_nbytes(tensors[name]);
if (buf.size() < len) {
buf.resize(len);
}
auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
f_in.seekg(offset);
f_in.read((char* )buf.data(), len);
}
~file_input() {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
}
}; };
struct export_lora_params { struct lora_merge_ctx {
std::string fn_model_base; // input base model + adapters
std::string fn_model_out; file_input base_model;
std::vector<struct lora_info> lora; std::vector<std::unique_ptr<file_input>> adapters;
// for computing merged tensor
int n_threads; int n_threads;
ggml_backend_t backend = nullptr;
ggml_gallocr_t allocr = nullptr;
std::vector<uint8_t> read_buf;
// output file
struct gguf_context * ctx_out;
struct ggml_context * ctx_out_ggml;
std::ofstream fout;
lora_merge_ctx(
std::string & base_fname,
std::vector<std::tuple<std::string, float>> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
throw std::runtime_error("split model is not yet supported");
}
for (auto lora_inp : lora_files) {
auto fname = std::get<0>(lora_inp);
auto scale = std::get<1>(lora_inp);
std::unique_ptr<file_input> adapter(new file_input(fname, scale));
check_metadata_lora(adapter.get());
adapters.push_back(std::move(adapter));
}
ctx_out = gguf_init_empty();
struct ggml_init_params params = {
/*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
}; };
ctx_out_ggml = ggml_init(params);
backend = ggml_backend_cpu_init();
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}
struct lora_data { void check_metadata_lora(file_input * adapter) {
struct lora_info info; auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
std::vector<uint8_t> data; if (general_type != "adapter") {
struct ggml_context * ctx; throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
}
uint32_t lora_r; auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
uint32_t lora_alpha; if (adapter_type != "lora") {
}; throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
}
struct llama_file { auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
// use FILE * so we don't have to re-open the file to mmap auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture");
FILE * fp; if (general_arch_base != general_arch_lora) {
size_t size; throw std::runtime_error("model arch and LoRA arch mismatch");
}
}
llama_file(const char * fname, const char * mode) { ggml_type get_out_tensor_type(struct ggml_tensor * t) {
fp = std::fopen(fname, mode); if (t->type == GGML_TYPE_F32) {
if (fp == NULL) { return GGML_TYPE_F32;
size = 0;
} else { } else {
seek(0, SEEK_END); return GGML_TYPE_F16;
size = tell();
seek(0, SEEK_SET);
} }
} }
size_t tell() const { void run_merge() {
#ifdef _WIN32 // prepare metadata
__int64 ret = _ftelli64(fp); gguf_set_kv(ctx_out, base_model.ctx_gguf);
#else // output is forced to f16 for now
long ret = std::ftell(fp); gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) { // check if all lora adapters have the same tensors
#ifdef _WIN32 // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
int ret = _fseeki64(fp, (__int64) offset, whence); static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
#else if (adapters.size() > 1) {
int ret = std::fseek(fp, (long) offset, whence); for (size_t i = 1; i < adapters.size(); ++i) {
#endif if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
GGML_ASSERT(ret == 0); // same throw std::runtime_error(err_no_subset_adapter);
} }
for (auto & it : adapters[i]->tensors) {
void read_raw(void * ptr, size_t size) { if (adapters[0]->get_tensor(it.first) == nullptr) {
if (size == 0) { throw std::runtime_error(err_no_subset_adapter);
return;
} }
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
die_fmt("read error: %s", strerror(errno));
} }
if (ret != 1) {
die("unexpectedly reached end of file");
} }
} }
std::uint32_t read_u32() { // mapping base tensor to out tensor (same shape with base, but different type)
std::uint32_t ret; // if out_tensor == nullptr, we only copy it
read_raw(&ret, sizeof(ret)); std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
return ret; for (auto & it : base_model.tensors) {
bool t_a = true;
bool t_b = true;
for (auto & adapter : adapters) {
t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
} }
auto base_tensor = it.second;
std::string read_string(std::uint32_t len) { if (!t_a && !t_b) {
std::vector<char> chars(len); // only copy
read_raw(chars.data(), len); struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
return std::string(chars.data(), len); ggml_set_name(cpy_tensor, base_tensor->name);
} base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
gguf_add_tensor(ctx_out, cpy_tensor);
void write_raw(const void * ptr, size_t size) { } else if (t_a && t_b) {
if (size == 0) { // need merging
return; struct ggml_tensor * out_tensor = ggml_new_tensor(
} ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
errno = 0; ggml_set_name(out_tensor, base_tensor->name);
size_t ret = std::fwrite(ptr, size, 1, fp); base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
if (ret != 1) { gguf_add_tensor(ctx_out, out_tensor);
die_fmt("write error: %s", strerror(errno));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
bool eof() {
return tell() >= size;
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
static struct export_lora_params get_default_export_lora_params() {
struct export_lora_params result;
result.fn_model_base = "";
result.fn_model_out = "";
result.n_threads = GGML_DEFAULT_N_THREADS;
return result;
}
static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str());
fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n");
fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads);
}
static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
bool invalid_param = false;
std::string arg;
struct export_lora_params default_params = get_default_export_lora_params();
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "-m" || arg == "--model-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_model_base = argv[i];
} else if (arg == "-o" || arg == "--model-out") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_model_out = argv[i];
} else if (arg == "-l" || arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
struct lora_info lora;
lora.filename = argv[i];
lora.scale = 1.0f;
params->lora.push_back(lora);
} else if (arg == "-s" || arg == "--lora-scaled") {
if (++i >= argc) {
invalid_param = true;
break;
}
struct lora_info lora;
lora.filename = argv[i];
if (++i >= argc) {
invalid_param = true;
break;
}
lora.scale = std::stof(argv[i]);
params->lora.push_back(lora);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->n_threads = std::stoi(argv[i]);
if (params->n_threads <= 0) {
params->n_threads = std::thread::hardware_concurrency();
}
} else if (arg == "-h" || arg == "--help") {
export_lora_print_usage(argc, argv, &default_params);
exit(0);
} else { } else {
fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str()); throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
export_lora_print_usage(argc, argv, &default_params);
exit(1);
} }
} }
if (params->fn_model_base == default_params.fn_model_base) { // placeholder for the meta data
fprintf(stderr, "error: please specify a filename for model-base.\n"); {
export_lora_print_usage(argc, argv, &default_params); size_t meta_size = gguf_get_meta_size(ctx_out);
exit(1); zeros(fout, meta_size);
}
if (params->fn_model_out == default_params.fn_model_out) {
fprintf(stderr, "error: please specify a filename for model-out.\n");
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
return true;
} }
static void free_lora(struct lora_data * lora) { // process base model tensors
if (lora->ctx != NULL) { size_t n_merged = 0;
ggml_free(lora->ctx); for (auto & it : base_to_out_tensors) {
if (it.second != nullptr) {
merge_tensor(it.first, it.second);
n_merged++;
} else {
copy_tensor(it.first);
} }
delete lora;
} }
static struct lora_data * load_lora(struct lora_info * info) { // write output metadata
struct lora_data * result = new struct lora_data; {
result->info = *info; std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
result->ctx = NULL; gguf_get_meta_data(ctx_out, data.data());
result->lora_r = 1; fout.seekp(0);
result->lora_alpha = 1; fout.write((const char *)data.data(), data.size());
struct llama_file file(info->filename.c_str(), "rb");
if (file.fp == NULL) {
fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
info->filename.c_str());
free_lora(result);
return NULL;
} }
struct ggml_init_params params_ggml; printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE; printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
params_ggml.mem_buffer = NULL;
params_ggml.no_alloc = true;
result->ctx = ggml_init(params_ggml);
uint32_t magic = file.read_u32();
if (magic != LLAMA_FILE_MAGIC_GGLA) {
die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
}
uint32_t version = file.read_u32();
if (version != 1) {
die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
}
result->lora_r = file.read_u32();
result->lora_alpha = file.read_u32();
// read tensor infos from file
std::vector<char> name_buf;
std::vector<struct ggml_tensor *> tensors;
std::vector<size_t> tensors_offset;
size_t total_nbytes_pad = 0;
while(!file.eof()) {
int64_t ne[4] = {1,1,1,1};
uint32_t n_dims = file.read_u32();
uint32_t namelen = file.read_u32();
uint32_t type = file.read_u32();
for (uint32_t k = 0; k < n_dims; ++k) {
ne[k] = (int64_t)file.read_u32();
}
name_buf.clear();
name_buf.resize(namelen + 1, '\0');
file.read_raw(name_buf.data(), namelen);
file.seek((0-file.tell()) & 31, SEEK_CUR);
size_t offset = file.tell();
struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
ggml_set_name(tensor, name_buf.data());
size_t nbytes = ggml_nbytes(tensor);
size_t nbytes_pad = ggml_nbytes_pad(tensor);
total_nbytes_pad += nbytes_pad;
tensors.push_back(tensor);
tensors_offset.push_back(offset);
file.seek(nbytes, SEEK_CUR);
}
// read tensor data
result->data.resize(total_nbytes_pad);
size_t data_offset = 0;
for (size_t i = 0; i < tensors.size(); ++i) {
struct ggml_tensor * tensor = tensors[i];
size_t offset = tensors_offset[i];
size_t nbytes = ggml_nbytes(tensor);
size_t nbytes_pad = ggml_nbytes_pad(tensor);
file.seek(offset, SEEK_SET);
tensor->data = result->data.data() + data_offset;
file.read_raw(tensor->data, nbytes);
data_offset += nbytes_pad;
}
return result;
} }
void copy_tensor(struct ggml_tensor * base) {
static struct ggml_cgraph * build_graph_lora( printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
struct ggml_context * ctx, size_t len = ggml_nbytes(base);
struct ggml_tensor * tensor, base_model.read_tensor_data(base->name, read_buf);
struct ggml_tensor * lora_a, fout.write((char* )read_buf.data(), len);
struct ggml_tensor * lora_b, zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
float scaling
) {
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
if (scaling != 1.0f) {
ab = ggml_scale(ctx, ab, scaling);
}
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
struct ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand (gf, res);
return gf;
} }
static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) { void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
if (lora->ctx == NULL) { std::string name_base(base->name);
return false; std::string name_lora_a = name_base + ".lora_a";
std::string name_lora_b = name_base + ".lora_b";
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
// context for input tensor
std::vector<struct ggml_tensor *> inp_a(adapters.size());
std::vector<struct ggml_tensor *> inp_b(adapters.size());
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
struct ggml_context * ctx = ggml_init(params);
// alloc tensors
struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
for (size_t i = 0; i < adapters.size(); ++i) {
auto t_a = adapters[i]->get_tensor(name_lora_a);
auto t_b = adapters[i]->get_tensor(name_lora_b);
inp_a[i] = ggml_dup_tensor(ctx, t_a);
inp_b[i] = ggml_dup_tensor(ctx, t_b);
} }
std::string name = ggml_get_name(tensor); ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
std::string name_a = name + std::string(".loraA");
std::string name_b = name + std::string(".loraB"); // load base tensor to backend buffer
struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str()); base_model.read_tensor_data(name_base, read_buf);
struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str()); if (base->type != GGML_TYPE_F32) {
if (lora_a == NULL || lora_b == NULL) { // optionally dequantize it
return false; printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
} }
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r; // load lora tensors to backend buffer
for (size_t i = 0; i < adapters.size(); ++i) {
adapters[i]->read_tensor_data(name_lora_a, read_buf);
ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
adapters[i]->read_tensor_data(name_lora_b, read_buf);
ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
}
struct ggml_init_params params; // build graph
params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; struct ggml_cgraph * gf;
params.mem_buffer = NULL; {
params.no_alloc = true; static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
struct ggml_context * ctx = NULL; static std::vector<uint8_t> buf(buf_size);
struct ggml_gallocr * alloc = NULL; struct ggml_init_params params0 = {
struct ggml_cgraph * gf = NULL; /*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params0);
gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur = inp_base;
for (size_t i = 0; i < adapters.size(); ++i) {
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
// scale
const float alpha = adapters[i]->alpha;
const float rank = (float) inp_b[i]->ne[0];
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
delta = ggml_scale(ctx0, delta, scale);
cur = ggml_add(ctx0, delta, cur);
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
}
cur = ggml_cast(ctx0, cur, out->type);
printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
}
ctx = ggml_init(params); // compute
alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); {
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); ggml_gallocr_alloc_graph(allocr, gf);
ggml_backend_cpu_set_n_threads(backend, n_threads);
ggml_backend_graph_compute(backend, gf);
}
ggml_gallocr_alloc_graph(alloc, gf); // write data to output file
{
auto result = gf->nodes[gf->n_nodes - 1];
size_t len = ggml_nbytes(result);
if (read_buf.size() < len) {
read_buf.resize(len);
}
ggml_backend_tensor_get(result, read_buf.data(), 0, len);
fout.write((char* )read_buf.data(), len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
}
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
static std::vector<uint8_t> data_work;
data_work.resize(cplan.work_size);
cplan.work_data = data_work.data();
ggml_graph_compute(gf, &cplan);
ggml_gallocr_free(alloc);
ggml_free(ctx); ggml_free(ctx);
return true; ggml_backend_buffer_free(buffer);
} }
static void export_lora(struct export_lora_params * params) { ~lora_merge_ctx() {
// load all loras ggml_gallocr_free(allocr);
std::vector<struct lora_data *> loras; ggml_backend_free(backend);
for (size_t i = 0; i < params->lora.size(); ++i) { gguf_free(ctx_out);
struct lora_data * lora = load_lora(&params->lora[i]); ggml_free(ctx_out_ggml);
if (lora != NULL) {
loras.push_back(lora);
}
}
if (loras.size() == 0) {
fprintf(stderr, "warning: no lora adapters will be applied.\n");
} }
};
// open input file static void print_usage(int argc, char ** argv, const gpt_params & params) {
struct llama_file fin(params->fn_model_base.c_str(), "rb"); gpt_params_print_usage(argc, argv, params);
if (!fin.fp) {
die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
}
// open base model gguf, read tensors without their data printf("\nexample usage:\n");
struct ggml_context * ctx_in; printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
struct gguf_init_params params_gguf; printf("\nNOTE: output model is F16\n");
params_gguf.no_alloc = true;
params_gguf.ctx = &ctx_in;
struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
// create new gguf
struct gguf_context * gguf_out = gguf_init_empty();
// copy meta data from base model: kv and tensors
gguf_set_kv(gguf_out, gguf_in);
int n_tensors = gguf_get_n_tensors(gguf_in);
for (int i=0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(gguf_in, i);
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
gguf_add_tensor(gguf_out, tensor);
}
// create output file
struct llama_file fout(params->fn_model_out.c_str(), "wb");
if (!fout.fp) {
die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
}
// write gguf meta data
std::vector<uint8_t> meta;
meta.resize(gguf_get_meta_size(gguf_out));
gguf_get_meta_data(gguf_out, meta.data());
fout.write_raw(meta.data(), meta.size());
std::vector<uint8_t> data;
std::vector<uint8_t> padding;
for (int i=0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(gguf_in, i);
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
// read tensor data
data.resize(ggml_nbytes(tensor));
tensor->data = data.data();
size_t offset = gguf_get_tensor_offset(gguf_in, i);
fin.seek(offset + meta.size(), SEEK_SET);
fin.read_raw(data.data(), data.size());
// apply all loras
for (size_t k = 0; k < loras.size(); ++k) {
apply_lora(tensor, loras[k], params->n_threads);
}
// write tensor data + padding
padding.clear();
padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
GGML_ASSERT(fout.tell() == offset + meta.size());
// fout.seek(offset + meta.size(), SEEK_SET);
fout.write_raw(data.data(), data.size());
fout.write_raw(padding.data(), padding.size());
if (i % 2 == 0) {
printf(".");
}
}
printf("\n"); printf("\n");
// close gguf
gguf_free(gguf_out);
gguf_free(gguf_in);
// free loras
for (size_t i = 0; i < loras.size(); ++i) {
free_lora(loras[i]);
}
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
struct export_lora_params params = get_default_export_lora_params(); gpt_params params;
if (!export_lora_params_parse(argc, argv, &params)) { if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1; return 1;
} }
export_lora(&params); g_verbose = (params.verbosity == 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
exit(EXIT_FAILURE);
}
printf("done, output file is %s\n", params.lora_outfile.c_str());
return 0; return 0;
} }

View file

@ -1,5 +0,0 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,90 +0,0 @@
# finetune
Basic usage instructions:
```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# finetune LORA adapter
./bin/llama-finetune \
--model-base open-llama-3b-v2-q8_0.gguf \
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
--lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
--train-data "shakespeare.txt" \
--save-every 10 \
--threads 6 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing
# predict
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
So in above example after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
After 10 more iterations:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
The default LORA rank can be specified with `--lora-r N`.
The LORA rank can be configured for each model tensor type separately with these command line options:
```bash
--lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
--rank-att-norm N LORA rank for attention norm tensor (default 1)
--rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1)
--rank-out-norm N LORA rank for output norm tensor (default 1)
--rank-tok-embd N LORA rank for token embeddings tensor (default 4)
--rank-out N LORA rank for output tensor (default 4)
--rank-wq N LORA rank for wq tensor (default 4)
--rank-wk N LORA rank for wk tensor (default 4)
--rank-wv N LORA rank for wv tensor (default 4)
--rank-wo N LORA rank for wo tensor (default 4)
--rank-ffn_gate N LORA rank for ffn_gate tensor (default 4)
--rank-ffn_down N LORA rank for ffn_down tensor (default 4)
--rank-ffn_up N LORA rank for ffn_up tensor (default 4)
```
The LORA rank of 'norm' tensors should always be 1.
To see all available options use `llama-finetune --help`.

View file

@ -1,487 +0,0 @@
#!/usr/bin/env python3
# finetune checkpoint --> gguf conversion
import argparse
import gguf
import struct
import numpy as np
from pathlib import Path
# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE = "training.type"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
class Tensor:
def __init__(self, dtype='f', ne=None):
if ne is None:
ne = []
self.dtype = dtype
self.ne = ne
self.nbytes = 0
if self.dtype == 'f':
if len(self.ne) == 0:
self.nbytes = 0
else:
self.nbytes = int(np.prod(self.ne)) * 4
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
def load(self, data, offset):
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
assert(nd == len(self.ne))
ne = []
for d in range(nd):
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
ne.append(n)
if tuple(ne) != tuple(self.ne):
raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
if self.dtype == 'f':
assert(dtype == 0)
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
self.name = bytes(data[offset:offset+namelen]); offset += namelen
# 32-byte alignment
offset += (0 - offset) & 31
self.data = data[offset:offset+self.nbytes]
offset += self.nbytes
return offset
def max_storage_size(self):
result = 0
result += 4 # nd
result += 4 # namelen
result += 4 # dtype
result += len(self.ne)*8 # ne
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
result += 31 # 32-byte alignment
result += self.nbytes
return result
def save_gguf(self, gguf_writer, name):
gguf_writer.add_tensor(
name=name,
tensor=self.data,
raw_shape=np.array(list(reversed(self.ne))),
raw_dtype=gguf.GGMLQuantizationType.F32)
class OptimizationContext:
def __init__(self):
pass
def load(self, data, offset):
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
offset += 4
if self.version != 1:
raise ValueError('Invalid version of optimization context in checkpoint file')
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
# forgot to save type in version 1:
# guess self.type from number of remaining bytes
size_type_0 = 12 + sum([t.max_storage_size() for t in
[self.adam_m, self.adam_v]
+([self.adam_pf] if (self.past > 0) else [])])
size_type_1 = 24 + sum([t.max_storage_size() for t in
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
self.lbfgs_lmal, self.lbfgs_lmys,
self.lbfgs_lms, self.lbfgs_lmy]
+([self.lbfgs_pf] if (self.past > 0) else [])])
# due to alignment padding the size might not by exact
# but the difference in size for both types is significant,
# so we can just use whichever is closest
remaining = len(data) - offset
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
self.type = 0
else:
self.type = 1
if self.type == 0:
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = self.adam_pf.load(data,offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError(f"Invalid optimizer type '{self.type}'")
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
if self.type == 0:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
if self.past > 0:
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
elif self.type == 1:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
if self.past > 0:
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
else:
raise ValueError('Unknown optimizer type')
class LoraParams:
def __init__(self):
pass
def load(self, data, offset):
self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
class ModelParams:
def __init__(self, n_ff = None):
self.n_ff = n_ff
def load(self, data, offset):
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def get_n_ff(self):
if self.n_ff is None:
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
else:
return self.n_ff
def save_gguf(self, gguf_writer):
# self.n_vocab not saved
gguf_writer.add_embedding_length(self.n_embd)
gguf_writer.add_head_count(self.n_head)
gguf_writer.add_block_count(self.n_layer)
gguf_writer.add_rope_dimension_count(self.n_rot)
gguf_writer.add_feed_forward_length(self.get_n_ff())
def tensor_name(key, bid=None, suffix=".weight"):
return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
class Layer:
def __init__(self, params, lora_params, bid):
self.bid = bid
self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
def load(self, data, offset):
offset = self.att_norm_a.load(data, offset)
offset = self.att_norm_b.load(data, offset)
offset = self.wq_a.load(data, offset)
offset = self.wq_b.load(data, offset)
offset = self.wk_a.load(data, offset)
offset = self.wk_b.load(data, offset)
offset = self.wv_a.load(data, offset)
offset = self.wv_b.load(data, offset)
offset = self.wo_a.load(data, offset)
offset = self.wo_b.load(data, offset)
offset = self.ffn_norm_a.load(data, offset)
offset = self.ffn_norm_b.load(data, offset)
offset = self.w1_a.load(data, offset)
offset = self.w1_b.load(data, offset)
offset = self.w2_a.load(data, offset)
offset = self.w2_b.load(data, offset)
offset = self.w3_a.load(data, offset)
offset = self.w3_b.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
class LoraModel:
def __init__(self, n_ff = None):
self.params = ModelParams(n_ff = n_ff)
self.lora_params = LoraParams()
self.layers = []
def load(self, data, offset):
offset = self.params.load(data, offset)
offset = self.lora_params.load(data, offset)
self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
offset = self.tok_embd_a.load(data, offset)
offset = self.tok_embd_b.load(data, offset)
offset = self.norm_a.load(data, offset)
offset = self.norm_b.load(data, offset)
offset = self.output_a.load(data, offset)
offset = self.output_b.load(data, offset)
self.layers.clear()
for bid in range(self.params.n_layer):
layer = Layer(self.params, self.lora_params, bid)
offset = layer.load(data, offset)
self.layers.append(layer)
return offset
def save_gguf(self, gguf_writer):
self.params.save_gguf(gguf_writer)
self.lora_params.save_gguf(gguf_writer)
self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
for layer in self.layers:
layer.save_gguf(gguf_writer)
class LoraCheckpoint:
def __init__(self, n_ff = None):
self.model = LoraModel(n_ff = n_ff)
self.opt_ctx = OptimizationContext()
def load(self, data, offset):
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
if magic != b'ggcl':
raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
if self.version != 0:
raise ValueError('Invalid version of checkpoint file')
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
offset = self.model.load(data, offset)
offset = self.opt_ctx.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
gguf_writer.add_layer_norm_rms_eps(1e-5)
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
self.model.save_gguf(gguf_writer)
self.opt_ctx.save_gguf(gguf_writer)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
return parser.parse_args()
def main():
cfg = handle_args()
print(cfg)
data = np.memmap(cfg.input, mode = 'r')
chk = LoraCheckpoint(n_ff = cfg.ff)
offset = 0
offset = chk.load(data, offset)
# we should have read all available data
assert(offset == len(data))
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
chk.save_gguf(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
if __name__ == '__main__':
main()

File diff suppressed because it is too large Load diff

View file

@ -1,34 +0,0 @@
#!/bin/bash
cd `dirname $0`
cd ../..
EXE="./llama-finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.
while getopts "dg" opt; do
case $opt in
d)
DEBUGGER="gdb --args"
;;
g)
EXE="./build/bin/Release/finetune"
GPUARG="--gpu-layers 25"
;;
esac
done
$DEBUGGER $EXE \
--model-base $MODEL \
$GPUARG \
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
--train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
--save-every 10 \
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing

View file

@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
auto decoded = decode_utf8(input_str, {}); auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first; const auto & code_points = decoded.first;
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
size_t pos = 0; size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks; const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) { llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
if (cur_stacks.empty()) {
error_pos = pos; error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
grammar->stacks = prev_stacks; cur_stacks = prev_stacks;
return false; return false;
} }
++pos; ++pos;
} }
for (const auto & stack : grammar->stacks) { for (const auto & stack : cur_stacks) {
if (stack.empty()) { if (stack.empty()) {
return true; return true;
} }

View file

@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
if (!ctx) {
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
return false;
}
printf("%s: version: %d\n", __func__, gguf_get_version(ctx)); printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

View file

@ -1,6 +1,6 @@
# llama.cpp/examples/imatrix # llama.cpp/examples/imatrix
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models. Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
## Usage ## Usage

View file

@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return env->NewStringUTF(""); return nullptr;
} }
auto new_token_chars = llama_token_to_piece(context, new_token_id); auto new_token_chars = llama_token_to_piece(context, new_token_id);

View file

@ -26,11 +26,12 @@ actor LlamaContext {
private var context: OpaquePointer private var context: OpaquePointer
private var batch: llama_batch private var batch: llama_batch
private var tokens_list: [llama_token] private var tokens_list: [llama_token]
var is_done: Bool = false
/// This variable is used to store temporarily invalid cchars /// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar] private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 64 var n_len: Int32 = 1024
var n_cur: Int32 = 0 var n_cur: Int32 = 0
var n_decode: Int32 = 0 var n_decode: Int32 = 0
@ -160,6 +161,7 @@ actor LlamaContext {
if llama_token_is_eog(model, new_token_id) || n_cur == n_len { if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n") print("\n")
is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0]) let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll() temporary_invalid_cchars.removeAll()
return new_token_str return new_token_str

View file

@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
messageLog += "\(text)" messageLog += "\(text)"
Task.detached { Task.detached {
while await llamaContext.n_cur < llamaContext.n_len { while await !llamaContext.is_done {
let result = await llamaContext.completion_loop() let result = await llamaContext.completion_loop()
await MainActor.run { await MainActor.run {
self.messageLog += "\(result)" self.messageLog += "\(result)"

View file

@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
auto formatted = llama_chat_format_single( auto formatted = llama_chat_format_single(
model, g_params->chat_template, chat_msgs, new_msg, role == "user"); model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content}); chat_msgs.push_back({role, content});
LOG("formatted: %s\n", formatted.c_str());
return formatted; return formatted;
} }

289
examples/pydantic_models_to_grammar_examples.py Normal file → Executable file
View file

@ -1,8 +1,15 @@
# Function calling example using pydantic models. #!/usr/bin/env python3
"""Function calling example using pydantic models."""
from __future__ import annotations from __future__ import annotations
import argparse
import datetime import datetime
import json import json
import logging
import textwrap
import sys
from enum import Enum from enum import Enum
from typing import Optional, Union from typing import Optional, Union
@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert
create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation) create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
# Function to get completion on the llama.cpp server with grammar. def create_completion(host, prompt, gbnf_grammar):
def create_completion(prompt, grammar): """Calls the /completion API on llama-server.
See
https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
"""
print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
data = {"prompt": prompt, "grammar": grammar} data = {"prompt": prompt, "grammar": gbnf_grammar}
result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
data = response.json()
assert data.get("error") is None, data assert data.get("error") is None, data
logging.info("Result: %s", result)
print(data["content"]) content = result["content"]
return data["content"] print(f" Model: {result['model']}")
print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}")
return content
# A function for the agent to send a message to the user. # A function for the agent to send a message to the user.
class SendMessageToUser(BaseModel): class SendMessageToUser(BaseModel):
""" """Send a message to the User."""
Send a message to the User.
"""
chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.") chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
message: str = Field(..., description="Message you want to send to the user.") message: str = Field(..., description="Message you want to send to the user.")
def run(self): def run(self):
print(self.message) print(f"SendMessageToUser: {self.message}")
def example_rce(host):
"""Minimal test case where the LLM call an arbitrary python function."""
print("- example_rce")
tools = [SendMessageToUser]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
tools_map = {tool.__name__:tool for tool in tools}
# This finds "SendMessageToUser":
tool = tools_map.get(json_data["function"])
if not tool:
print(f"Error: unknown tool {json_data['function']}")
return 1
tool(**json_data["function_parameters"]).run()
return 0
# Enum for the calculator tool. # Enum for the calculator tool.
@ -46,11 +77,11 @@ class MathOperation(Enum):
DIVIDE = "divide" DIVIDE = "divide"
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. # Simple pydantic calculator tool for the agent that can add, subtract,
# multiply, and divide. Docstring and description of fields will be used in
# system prompt.
class Calculator(BaseModel): class Calculator(BaseModel):
""" """Perform a math operation on two numbers."""
Perform a math operation on two numbers.
"""
number_one: Union[int, float] = Field(..., description="First number.") number_one: Union[int, float] = Field(..., description="First number.")
operation: MathOperation = Field(..., description="Math operation to perform.") operation: MathOperation = Field(..., description="Math operation to perform.")
number_two: Union[int, float] = Field(..., description="Second number.") number_two: Union[int, float] = Field(..., description="Second number.")
@ -68,55 +99,61 @@ class Calculator(BaseModel):
raise ValueError("Unknown operation.") raise ValueError("Unknown operation.")
# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM. def example_calculator(host):
# pydantic_model_list is the list of pydanitc models """Have the LLM ask to get a calculation done.
# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
# outer_object_content is the name of outer object content. Here the grammar gets generated by passing the available function models to
# model_prefix is the optional prefix for models in the documentation. (Default="Output Model") generate_gbnf_grammar_and_documentation function. This also generates a
# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") documentation usable by the LLM.
pydantic_model_list is the list of pydantic models outer_object_name is an
optional name for an outer object around the actual model object. Like a
"function" object with "function_parameters" which contains the actual model
object. If None, no outer object will be generated outer_object_content is
the name of outer object content.
model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
"""
print("- example_calculator")
tools = [SendMessageToUser, Calculator]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function", pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
print(gbnf_grammar)
print(documentation)
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message1 = "What is 42 * 42?"
user_message = "What is 42 * 42?" prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
text = create_completion(prompt=prompt, grammar=gbnf_grammar) expected = {
# This should output something like this: "function": "Calculator",
# { "function_parameters": {
# "function": "calculator", "number_one": 42,
# "function_parameters": { "operation": "multiply",
# "number_one": 42, "number_two": 42
# "operation": "multiply", }
# "number_two": 42 }
# } if json_data != expected:
# } print(" Result is not as expected!")
function_dictionary = json.loads(text) tools_map = {tool.__name__:tool for tool in tools}
if function_dictionary["function"] == "calculator": # This finds "Calculator":
function_parameters = {**function_dictionary["function_parameters"]} tool = tools_map.get(json_data["function"])
if not tool:
print(Calculator(**function_parameters).run()) print(f"Error: unknown tool {json_data['function']}")
# This should output: 1764 return 1
result = tool(**json_data["function_parameters"]).run()
print(f" Call {json_data['function']} gave result {result}")
return 0
# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
class Category(Enum): class Category(Enum):
""" """The category of the book."""
The category of the book.
"""
Fiction = "Fiction" Fiction = "Fiction"
NonFiction = "Non-Fiction" NonFiction = "Non-Fiction"
class Book(BaseModel): class Book(BaseModel):
""" """Represents an entry about a book."""
Represents an entry about a book.
"""
title: str = Field(..., description="Title of the book.") title: str = Field(..., description="Title of the book.")
author: str = Field(..., description="Author of the book.") author: str = Field(..., description="Author of the book.")
published_year: Optional[int] = Field(..., description="Publishing year of the book.") published_year: Optional[int] = Field(..., description="Publishing year of the book.")
@ -125,33 +162,42 @@ class Book(BaseModel):
summary: str = Field(..., description="Summary of the book.") summary: str = Field(..., description="Summary of the book.")
# We need no additional parameters other than our list of pydantic models. def example_struct(host):
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book]) """A example structured output based on pydantic models.
The LLM will create an entry for a Book database out of an unstructured
text. We need no additional parameters other than our list of pydantic
models.
"""
print("- example_struct")
tools = [Book]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands.""" text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
json_data = json.loads(text) json_data = json.loads(text)
# In this case, there's no function nor function_parameters.
# Here the result will vary based on the LLM used.
keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
if keys != sorted(json_data.keys()):
print(f"Unexpected result: {sorted(json_data.keys())}")
return 1
book = Book(**json_data)
print(f" As a Book object: %s" % book)
return 0
print(Book(**json_data))
# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
def get_current_datetime(output_format: Optional[str] = None): def get_current_datetime(output_format: Optional[str] = None):
""" """Get the current date and time in the given format.
Get the current date and time in the given format.
Args: Args:
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S' output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
""" """
if output_format is None: return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
output_format = '%Y-%m-%d %H:%M:%S'
return datetime.datetime.now().strftime(output_format)
# Example function to get the weather # Example function to get the weather.
def get_current_weather(location, unit): def get_current_weather(location, unit):
"""Get the current weather in a given location""" """Get the current weather in a given location"""
if "London" in location: if "London" in location:
@ -160,11 +206,15 @@ def get_current_weather(location, unit):
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value}) return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
elif "North Pole" in location: elif "North Pole" in location:
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value}) return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
else:
return json.dumps({"location": location, "temperature": "unknown"}) return json.dumps({"location": location, "temperature": "unknown"})
# Here is a function definition in OpenAI style def example_concurrent(host):
"""An example for parallel function calling with a Python function, a pydantic
function model and an OpenAI like function definition.
"""
print("- example_concurrent")
# Function definition in OpenAI style.
current_weather_tool = { current_weather_tool = {
"type": "function", "type": "function",
"function": { "function": {
@ -183,45 +233,80 @@ current_weather_tool = {
}, },
}, },
} }
# Convert OpenAI function definition into pydantic model.
# Convert OpenAI function definition into pydantic model
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool) current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
# Add the actual function to a pydantic model # Add the actual function to a pydantic model.
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather) current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
# Convert normal Python function to a pydantic model # Convert normal Python function to a pydantic model.
current_datetime_model = create_dynamic_model_from_function(get_current_datetime) current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model] tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tool_list, outer_object_name="function", pydantic_model_list=tools, outer_object_name="function",
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True) outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42""" text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
json_data = json.loads(text) json_data = json.loads(text)
expected = [
print(json_data) {
# Should output something like this: "function": "get_current_datetime",
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}] "params": {
"output_format": "%Y-%m-%d %H:%M:%S"
}
},
{
"function": "get_current_weather",
"params": {
"location": "London",
"unit": "celsius"
}
},
{
"function": "Calculator",
"params": {
"number_one": 42,
"operation": "multiply",
"number_two": 42
}
}
]
res = 0
if json_data != expected:
print(" Result is not as expected!")
print(" This can happen on highly quantized models")
res = 1
tools_map = {tool.__name__:tool for tool in tools}
for call in json_data: for call in json_data:
if call["function"] == "Calculator": tool = tools_map.get(call["function"])
print(Calculator(**call["params"]).run()) if not tool:
elif call["function"] == "get_current_datetime": print(f"Error: unknown tool {call['function']}")
print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue] return 1
elif call["function"] == "get_current_weather": result = tool(**call["params"]).run()
print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue] print(f" Call {call['function']} returned {result}")
# Should output something like this: # Should output something like this:
# 2024-01-14 13:36:06 # Call get_current_datetime returned 2024-07-15 09:50:38
# {"location": "London", "temperature": "42", "unit": "celsius"} # Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
# 1764 # Call Calculator returned 1764
return res
def main():
parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
ret = 0
# Comment out below to only run the example you want.
ret = ret or example_rce(args.host)
ret = ret or example_calculator(args.host)
ret = ret or example_struct(args.host)
ret = ret or example_concurrent(args.host)
return ret
if __name__ == "__main__":
sys.exit(main())

View file

@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
Set of LLM REST APIs and a simple web front end to interact with llama.cpp. Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
**Features:** **Features:**
* LLM inference of F16 and quantum models on GPU and CPU * LLM inference of F16 and quantized models on GPU and CPU
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
* Parallel decoding with multi-user support * Parallel decoding with multi-user support
* Continuous batching * Continuous batching
@ -444,7 +444,7 @@ node index.js
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

View file

@ -225,7 +225,7 @@
throw new Error("already running"); throw new Error("already running");
} }
controller.value = new AbortController(); controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) { for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
const data = chunk.data; const data = chunk.data;
if (data.stop) { if (data.stop) {
while ( while (

View file

@ -1,5 +1,4 @@
<html> <html>
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" /> <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
@ -132,12 +131,20 @@
align-items: stretch; align-items: stretch;
} }
.right { .message-controls {
display: flex; display: flex;
flex-direction: row;
gap: 0.5em;
justify-content: flex-end; justify-content: flex-end;
} }
.message-controls > div:nth-child(2) {
display: flex;
flex-direction: column;
gap: 0.5em;
}
.message-controls > div:nth-child(2) > div {
display: flex;
margin-left: auto;
gap: 0.5em;
}
fieldset { fieldset {
border: none; border: none;
@ -276,6 +283,7 @@
import { llama } from './completion.js'; import { llama } from './completion.js';
import { SchemaConverter } from './json-schema-to-grammar.mjs'; import { SchemaConverter } from './json-schema-to-grammar.mjs';
let selected_image = false; let selected_image = false;
var slot_id = -1; var slot_id = -1;
@ -447,6 +455,9 @@
/* END: Support for storing prompt templates and parameters in browsers LocalStorage */ /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
const tts = window.speechSynthesis;
const ttsVoice = signal(null)
const llamaStats = signal(null) const llamaStats = signal(null)
const controller = signal(null) const controller = signal(null)
@ -479,7 +490,7 @@
throw new Error("already running"); throw new Error("already running");
} }
controller.value = new AbortController(); controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) { for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
const data = chunk.data; const data = chunk.data;
if (data.stop) { if (data.stop) {
@ -596,8 +607,51 @@
}); });
} }
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null;
function MessageInput() { function MessageInput() {
const message = useSignal("") const message = useSignal("");
const talkActive = useSignal(false);
const sendOnTalk = useSignal(false);
const talkStop = (e) => {
if (e) e.preventDefault();
talkActive.value = false;
talkRecognition?.stop();
}
const talk = (e) => {
e.preventDefault();
if (talkRecognition)
talkRecognition.start();
else
alert("Speech recognition is not supported by this browser.");
}
if(talkRecognition) {
talkRecognition.onstart = () => {
talkActive.value = true;
}
talkRecognition.onresult = (e) => {
if (event.results.length > 0) {
message.value = event.results[0][0].transcript;
if (sendOnTalk.value) {
submit(e);
}
}
}
talkRecognition.onspeechend = () => {
talkStop();
}
}
const ttsVoices = useSignal(tts?.getVoices() || []);
const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default));
if (tts) {
tts.onvoiceschanged = () => {
ttsVoices.value = tts.getVoices();
}
}
const submit = (e) => { const submit = (e) => {
stop(e); stop(e);
@ -624,12 +678,46 @@
value="${message}" value="${message}"
/> />
</div> </div>
<div class="right"> <div class="message-controls">
<button type="submit" disabled=${generating.value}>Send</button> <div> </div>
<button onclick=${uploadImage}>Upload Image</button> <div>
<div>
<button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
<button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
<button onclick=${stop} disabled=${!generating.value}>Stop</button> <button onclick=${stop} disabled=${!generating.value}>Stop</button>
<button onclick=${reset}>Reset</button> <button onclick=${reset}>Reset</button>
</div> </div>
<div>
<a href="#" style="cursor: help;" title="Help" onclick=${e => {
e.preventDefault();
alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
`(TTS and speech recognition are not provided by llama.cpp)\n` +
`Note: STT requires HTTPS to work.`);
}}>[?]</a>
<button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
<div>
<input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
<label for="send-on-talk" style="line-height: initial;">Send after talking</label>
</div>
</div>
<div>
<a href="#" style="cursor: help;" title="Help" onclick=${e => {
e.preventDefault();
alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
}}>[?]</a>
<label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
<select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
<option value="" selected="${!ttsVoice.value}">None</option>
${[
...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
...ttsVoices.value.filter(v => !v.default),
].map(
v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
)}
</select>
</div>
</div>
</div>
</form> </form>
` `
} }
@ -659,26 +747,86 @@
} }
}, [messages]) }, [messages])
const ttsChatLineActiveIx = useSignal(undefined);
const ttsChatLine = (e, ix, msg) => {
if (e) e.preventDefault();
if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;
const ttsVoices = tts.getVoices();
const voice = ttsVoices.find(v => v.name === ttsVoice.value);
if (!voice) return;
if (ttsChatLineActiveIx.value !== undefined) {
tts.cancel();
if (ttsChatLineActiveIx.value === ix) {
ttsChatLineActiveIx.value = undefined;
return;
}
}
ttsChatLineActiveIx.value = ix;
let ttsUtter = new SpeechSynthesisUtterance(msg);
ttsUtter.voice = voice;
ttsUtter.onend = e => {
ttsChatLineActiveIx.value = undefined;
};
tts.speak(ttsUtter);
}
const isCompletionMode = session.value.type === 'completion' const isCompletionMode = session.value.type === 'completion'
// Try play the last bot message
const lastCharChatLinesIxs = useSignal([]);
const lastCharChatLinesIxsOld = useSignal([]);
useEffect(() => {
if (
!isCompletionMode
&& lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length
&& !generating.value
) {
const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
if (ix !== undefined) {
const msg = messages[ix];
ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg);
}
lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
}
}, [generating.value]);
const chatLine = ([user, data], index) => { const chatLine = ([user, data], index) => {
let message let message
const isArrayMessage = Array.isArray(data) const isArrayMessage = Array.isArray(data);
if (params.value.n_probs > 0 && isArrayMessage) {
message = html`<${Probabilities} data=${data} />`
} else {
const text = isArrayMessage ? const text = isArrayMessage ?
data.map(msg => msg.content).join('') : data.map(msg => msg.content).join('') :
data; data;
if (params.value.n_probs > 0 && isArrayMessage) {
message = html`<${Probabilities} data=${data} />`
} else {
message = isCompletionMode ? message = isCompletionMode ?
text : text :
html`<${Markdownish} text=${template(text)} />` html`<${Markdownish} text=${template(text)} />`
} }
const fromBot = user && user === '{{char}}';
if (fromBot && !lastCharChatLinesIxs.value.includes(index))
lastCharChatLinesIxs.value.push(index);
if (user) { if (user) {
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>` return html`
<div>
<p key=${index}><strong>${template(user)}:</strong> ${message}</p>
${
fromBot && ttsVoice.value
&& html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</div>`
}
</div>
`;
} else { } else {
return isCompletionMode ? return isCompletionMode ?
html`<span key=${index}>${message}</span>` : html`<span key=${index}>${message}</span>` :
html`<p key=${index}>${message}</p>` html`<div><p key=${index}>${message}</p></div>`
} }
}; };

View file

@ -1,5 +0,0 @@
set(TARGET llama-train-text-from-scratch)
add_executable(${TARGET} train-text-from-scratch.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,27 +0,0 @@
# train-text-from-scratch
Basic usage instructions:
```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# train
./bin/llama-train-text-from-scratch \
--vocab-model ../models/ggml-vocab-llama.gguf \
--ctx 64 --embd 256 --head 8 --layer 16 \
--checkpoint-in chk-shakespeare-256x16-LATEST.gguf \
--checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
--model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
--train-data "shakespeare.txt" \
-t 6 -b 16 --seed 1 --adam-iter 256 \
--no-checkpointing
# predict
./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
```
Output files will be saved every N iterations (config with `--save-every N`).
The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
To train GGUF models just pass them to `--checkpoint-in FN`.

View file

@ -1,499 +0,0 @@
#!/usr/bin/env python3
# train-text-from-scratch checkpoint --> gguf conversion
import argparse
import os
import struct
import sys
import numpy as np
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
import gguf
# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE = "training.type"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
class Tensor:
def __init__(self, dtype='f', ne=None):
if ne is None:
ne = []
self.dtype = dtype
self.ne = ne
self.nbytes = 0
if self.dtype == 'f':
if len(self.ne) == 0:
self.nbytes = 0
else:
self.nbytes = int(np.prod(self.ne)) * 4
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
def load(self, data, offset):
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
assert(nd == len(self.ne))
ne = []
for d in range(nd):
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
ne.append(n)
assert(tuple(ne) == tuple(self.ne))
if self.dtype == 'f':
assert(dtype == 0)
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
self.name = bytes(data[offset:offset+namelen]); offset += namelen
# 32-byte alignment
offset += (0 - offset) & 31
self.data = data[offset:offset+self.nbytes]
offset += self.nbytes
return offset
def max_storage_size(self):
result = 0
result += 4 # nd
result += 4 # namelen
result += 4 # dtype
result += len(self.ne)*8 # ne
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
result += 31 # 32-byte alignment
result += self.nbytes
return result
def save_gguf(self, gguf_writer, name):
gguf_writer.add_tensor(
name=name,
tensor=self.data,
raw_shape=np.array(list(reversed(self.ne))),
raw_dtype=gguf.GGMLQuantizationType.F32)
class OptimizationParamsV0:
def __init__(self):
pass
def load(self, data, offset):
self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
class OptimizationContext:
def __init__(self):
pass
def load(self, data, offset):
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
offset += 4
if self.version == 0:
params = OptimizationParamsV0()
offset = params.load(data, offset)
self.past = params.past
self.lbfgs_m = params.lbfgs_m
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.type = params.type
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
if self.type == 0:
# these tensors are stored, but we don't need their data
x = Tensor('f', [self.nx])
g = Tensor('f', [self.nx])
g2 = Tensor('f', [self.nx])
mh = Tensor('f', [self.nx])
vh = Tensor('f', [self.nx])
offset = x.load(data, offset)
offset = g.load(data, offset)
offset = g2.load(data, offset)
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = mh.load(data, offset)
offset = vh.load(data, offset)
offset = self.adam_pf.load(data, offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError('Unknown optimizer type')
elif self.version == 1:
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
# forgot to save type in version 1:
# guess self.type from number of remaining bytes
size_type_0 = 12 + sum([t.max_storage_size() for t in
[self.adam_m, self.adam_v]
+([self.adam_pf] if (self.past > 0) else [])])
size_type_1 = 24 + sum([t.max_storage_size() for t in
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
self.lbfgs_lmal, self.lbfgs_lmys,
self.lbfgs_lms, self.lbfgs_lmy]
+([self.lbfgs_pf] if (self.past > 0) else [])])
# due to alignment padding the size might not by exact
# but the difference in size for both types is significant,
# so we can just use whichever is closest
remaining = len(data) - offset
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
self.type = 0
else:
self.type = 1
if self.type == 0:
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = self.adam_pf.load(data,offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError('Invalid version of checkpoint file')
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
if self.type == 0:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
if self.past > 0:
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
elif self.type == 1:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
if self.past > 0:
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
else:
raise ValueError('Unknown optimizer type')
class ModelParams:
def __init__(self):
pass
def load(self, data, offset):
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def get_n_ff(self):
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
def save_gguf(self, gguf_writer):
# self.n_vocab not saved
gguf_writer.add_embedding_length(self.n_embd)
gguf_writer.add_head_count(self.n_head)
gguf_writer.add_block_count(self.n_layer)
gguf_writer.add_rope_dimension_count(self.n_rot)
gguf_writer.add_feed_forward_length(self.get_n_ff())
def tensor_name(key, bid=None):
return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
class Layer:
def __init__(self, params, bid):
self.bid = bid
self.att_norm = Tensor('f', [params.n_embd])
self.wq = Tensor('f', [params.n_embd, params.n_embd])
self.wk = Tensor('f', [params.n_embd, params.n_embd])
self.wv = Tensor('f', [params.n_embd, params.n_embd])
self.wo = Tensor('f', [params.n_embd, params.n_embd])
self.ffn_norm = Tensor('f', [params.n_embd])
self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
def load(self, data, offset):
offset = self.att_norm.load(data, offset)
offset = self.wq.load(data, offset)
offset = self.wk.load(data, offset)
offset = self.wv.load(data, offset)
offset = self.wo.load(data, offset)
offset = self.ffn_norm.load(data, offset)
offset = self.w1.load(data, offset)
offset = self.w2.load(data, offset)
offset = self.w3.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
class Model:
def __init__(self):
self.params = ModelParams()
self.layers = []
def load(self, data, offset):
offset = self.params.load(data, offset)
self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
self.norm = Tensor('f', [self.params.n_embd])
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
offset = self.tok_embd.load(data, offset)
offset = self.norm.load(data, offset)
offset = self.output.load(data, offset)
self.layers.clear()
for bid in range(self.params.n_layer):
layer = Layer(self.params, bid)
offset = layer.load(data, offset)
self.layers.append(layer)
return offset
def save_gguf(self, gguf_writer):
self.params.save_gguf(gguf_writer)
self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
for layer in self.layers:
layer.save_gguf(gguf_writer)
class Checkpoint:
def __init__(self):
self.model = Model()
self.opt_ctx = OptimizationContext()
def load(self, data, offset):
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
if magic != b'ggcp':
raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
if self.version != 0:
raise ValueError('Invalid version of checkpoint file')
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
offset = self.model.load(data, offset)
offset = self.opt_ctx.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
gguf_writer.add_layer_norm_rms_eps(1e-5)
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
self.model.save_gguf(gguf_writer)
self.opt_ctx.save_gguf(gguf_writer)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
return parser.parse_args()
def main():
cfg = handle_args()
data = np.memmap(cfg.input, mode = 'r')
chk = Checkpoint()
offset = 0
offset = chk.load(data, offset)
# we should have read all available data
assert(offset == len(data))
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
chk.save_gguf(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
if __name__ == '__main__':
main()

File diff suppressed because it is too large Load diff

6
flake.lock generated
View file

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1720768451, "lastModified": 1721379653,
"narHash": "sha256-EYekUHJE2gxeo2pM/zM9Wlqw1Uw2XTJXOSAO79ksc4Y=", "narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "7e7c39ea35c5cdd002cd4588b03a3fb9ece6fad9", "rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
"type": "github" "type": "github"
}, },
"original": { "original": {

View file

@ -2400,6 +2400,7 @@ extern "C" {
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void); GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_cann (void); GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void);
// //
// Internal types and functions exposed for tests and benchmarks // Internal types and functions exposed for tests and benchmarks

View file

@ -467,15 +467,18 @@ if (GGML_SYCL)
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
endif() endif()
if ( NOT DEFINED ENV{ONEAPI_ROOT}) check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") if ( DEFINED ENV{ONEAPI_ROOT})
message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
elseif(SUPPORTS_SYCL)
message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
source /opt/intel/oneapi/setvars.sh")
else()
message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
endif() endif()
#todo: AOT
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
message(STATUS "SYCL found") message(STATUS "SYCL found")
#todo: AOT
list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL) list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
@ -487,11 +490,9 @@ if (GGML_SYCL)
add_compile_definitions(GGML_SYCL_FORCE_MMQ) add_compile_definitions(GGML_SYCL_FORCE_MMQ)
endif() endif()
add_compile_options(-I./) #include DPCT set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
if (GGML_SYCL_TARGET STREQUAL "NVIDIA") if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32) add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
else() else()
add_compile_definitions(GGML_SYCL_WARP_SIZE=16) add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
@ -504,14 +505,14 @@ if (GGML_SYCL)
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp") list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
if (WIN32) if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else() else()
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (GGML_SYCL_TARGET STREQUAL "INTEL") if (GGML_SYCL_TARGET STREQUAL "INTEL")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
endif() endif()
endif() endif()

View file

@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
#elif defined(__ARM_NEON) && defined(__aarch64__) #elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;
float * res_ptr = s; float * res_ptr = s;
@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
} }
#endif #endif
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;
float * res_ptr = s; float * res_ptr = s;
@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(ncols_interleaved); UNUSED(ncols_interleaved);
UNUSED(blocklen); UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) { if (svcntw() == 8) {
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;
@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
#elif defined(__ARM_NEON) && defined(__aarch64__) #elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;
float * res_ptr = s; float * res_ptr = s;
@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
} }
#endif #endif
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;
float * res_ptr = s; float * res_ptr = s;
@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(ncols_interleaved); UNUSED(ncols_interleaved);
UNUSED(blocklen); UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) { if (svcntw() == 8) {
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;

View file

@ -459,7 +459,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
c = __builtin_amdgcn_sdot4(a, b, c, false); c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) #elif defined(RDNA3)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);

View file

@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream); mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
break; break;
case GGML_TYPE_IQ2_XXS:
mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_XS:
mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_S:
mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ3_XXS:
mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
break;
case GGML_TYPE_IQ3_S:
mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ1_S:
mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream); mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
break; break;
@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_NL:
mmq_supported = true; mmq_supported = true;

File diff suppressed because it is too large Load diff

View file

@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
TYPES_MMQ = [ TYPES_MMQ = [
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K", "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
"GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS" "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
] ]
SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.

View file

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ1_S);

View file

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ2_S);

View file

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);

View file

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);

View file

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ3_S);

View file

@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);

View file

@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
return sumi*d8d8 + m8s8 / (QI8_1 / vdr); return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
} }
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
const int * v, const int * u, const float * d8_0, const float & d8_1) {
float sumf = 0.0f;
#pragma unroll
for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
int sumi = 0;
#pragma unroll
for (int i = i0; i < i0 + QI8_0/2; ++i) {
// SIMD dot product of quantized values
sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
}
sumf += d8_0[i0/(QI8_0/2)]*sumi;
}
return d8_1*sumf;
}
#define VDR_Q2_K_Q8_1_MMVQ 1 #define VDR_Q2_K_Q8_1_MMVQ 1
#define VDR_Q2_K_Q8_1_MMQ 4 #define VDR_Q2_K_Q8_1_MMQ 4

View file

@ -4748,7 +4748,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
} }
#elif defined(__POWER9_VECTOR__) #elif defined(__POWER9_VECTOR__)

View file

@ -267,7 +267,7 @@ struct ggml_backend_sycl_context {
queue_ptr stream(int device, int stream) { queue_ptr stream(int device, int stream) {
if (qptrs[device][stream] == nullptr) { if (qptrs[device][stream] == nullptr) {
qptrs[device][stream] = &(dpct::get_current_device().default_queue()); qptrs[device][stream] = &(dpct::get_device(device).default_queue());
} }
return qptrs[device][stream]; return qptrs[device][stream];
} }

View file

@ -734,7 +734,12 @@ namespace dpct
void destroy_queue(sycl::queue queue) { void destroy_queue(sycl::queue queue) {
std::lock_guard<mutex_type> lock(m_mutex); std::lock_guard<mutex_type> lock(m_mutex);
_queues.clear(); _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
[=](const sycl::queue &q) -> bool
{
return q == queue;
}),
_queues.end());
} }
void set_saved_queue(sycl::queue q) { void set_saved_queue(sycl::queue q) {
std::lock_guard<mutex_type> lock(m_mutex); std::lock_guard<mutex_type> lock(m_mutex);
@ -764,13 +769,13 @@ namespace dpct
if (enable_exception_handler) { if (enable_exception_handler) {
eh = exception_handler; eh = exception_handler;
} }
auto q = sycl::queue(*this, eh, _queues.push_back(sycl::queue(
*this, eh,
sycl::property_list( sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED #ifdef DPCT_PROFILING_ENABLED
sycl::property::queue::enable_profiling(), sycl::property::queue::enable_profiling(),
#endif #endif
properties...)); properties...)));
_queues.push_back(q);
return _queues.back(); return _queues.back();
} }
@ -783,8 +788,8 @@ namespace dpct
if (enable_exception_handler) { if (enable_exception_handler) {
eh = exception_handler; eh = exception_handler;
} }
_queues.push_back( _queues.push_back(sycl::queue(
sycl::queue(device, eh, device, eh,
sycl::property_list( sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED #ifdef DPCT_PROFILING_ENABLED
sycl::property::queue::enable_profiling(), sycl::property::queue::enable_profiling(),
@ -855,15 +860,75 @@ namespace dpct
unsigned int get_device_id(const sycl::device &dev) unsigned int get_device_id(const sycl::device &dev)
{ {
unsigned int id = 0; unsigned int id = 0;
for (auto dev_item : _devs) for (auto &dev_item : _devs)
{ {
if (*dev_item == dev) if (*dev_item == dev)
{ {
break; return id;
} }
id++; id++;
} }
return id; return -1;
}
inline std::string get_preferred_gpu_platform_name() {
std::string result;
std::string filter = "level-zero";
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
if (env) {
if (std::strstr(env, "level_zero")) {
filter = "level-zero";
}
else if (std::strstr(env, "opencl")) {
filter = "opencl";
}
else if (std::strstr(env, "cuda")) {
filter = "cuda";
}
else if (std::strstr(env, "hip")) {
filter = "hip";
}
else {
throw std::runtime_error("invalid device filter: " + std::string(env));
}
}
auto plaform_list = sycl::platform::get_platforms();
for (const auto& platform : plaform_list) {
auto devices = platform.get_devices();
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
return d.is_gpu();
});
if (gpu_dev == devices.end()) {
// cout << "platform [" << platform_name
// << "] does not contain GPU devices, skipping\n";
continue;
}
auto platform_name = platform.get_info<sycl::info::platform::name>();
std::string platform_name_low_case;
platform_name_low_case.resize(platform_name.size());
std::transform(
platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
if (platform_name_low_case.find(filter) == std::string::npos) {
// cout << "platform [" << platform_name
// << "] does not match with requested "
// << filter << ", skipping\n";
continue;
}
result = platform_name;
}
if (result.empty())
throw std::runtime_error("can not find preferred GPU platform");
return result;
} }
template <class DeviceSelector> template <class DeviceSelector>
@ -930,10 +995,15 @@ namespace dpct
// Keep track of the number of devices per backend // Keep track of the number of devices per backend
std::map<sycl::backend, size_t> DeviceNums; std::map<sycl::backend, size_t> DeviceNums;
std::map<std::string, std::vector<sycl::device>> backend_devices; std::map<std::string, std::vector<sycl::device>> backend_devices;
auto preferred_platform_name = get_preferred_gpu_platform_name();
while (!Platforms.empty()) { while (!Platforms.empty()) {
auto Platform = Platforms.back(); auto Platform = Platforms.back();
Platforms.pop_back(); Platforms.pop_back();
auto platform_name = Platform.get_info<sycl::info::platform::name>();
if (platform_name.compare(preferred_platform_name) != 0) {
continue;
}
auto devices = Platform.get_devices(); auto devices = Platform.get_devices();
std::string backend_type = get_device_backend_and_type(devices[0]); std::string backend_type = get_device_backend_and_type(devices[0]);
for (const auto &device : devices) { for (const auto &device : devices) {
@ -1989,6 +2059,11 @@ namespace dpct
return dev_mgr::instance().current_device(); return dev_mgr::instance().current_device();
} }
static inline device_ext &get_device(unsigned int id)
{
return dev_mgr::instance().get_device(id);
}
static inline sycl::queue &get_in_order_queue() static inline sycl::queue &get_in_order_queue()
{ {
return dev_mgr::instance().current_device().in_order_queue(); return dev_mgr::instance().current_device().in_order_queue();

View file

@ -152,7 +152,8 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
const sycl::range<3> block_dims(1, 1, nth); const sycl::range<3> block_dims(1, 1, nth);
const sycl::range<3> block_nums(1, 1, nrows_x); const sycl::range<3> block_nums(1, 1, nrows_x);
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE); const size_t n_val_tmp = nth / WARP_SIZE;
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + n_val_tmp);
const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head_kv = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));

View file

@ -38,8 +38,6 @@
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
#define VK_NUM_TYPES 16
#define GGML_VK_MAX_NODES 8192 #define GGML_VK_MAX_NODES 8192
#define MAX_VK_BUFFERS 256 #define MAX_VK_BUFFERS 256
@ -162,23 +160,23 @@ struct vk_device_struct {
vk_matmul_pipeline pipeline_matmul_f16_f32; vk_matmul_pipeline pipeline_matmul_f16_f32;
vk_pipeline pipeline_matmul_split_k_reduce; vk_pipeline pipeline_matmul_split_k_reduce;
vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES]; vk_matmul_pipeline pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
vk_matmul_pipeline pipeline_matmul_id_f32; vk_matmul_pipeline pipeline_matmul_id_f32;
vk_matmul_pipeline pipeline_matmul_id_f16; vk_matmul_pipeline pipeline_matmul_id_f16;
vk_matmul_pipeline pipeline_matmul_id_f16_f32; vk_matmul_pipeline pipeline_matmul_id_f16_f32;
vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES]; vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
vk_pipeline pipeline_dequant[VK_NUM_TYPES]; vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES]; vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES]; vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES]; vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_mul_mat_vec_p021_f16_f32; vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
vk_pipeline pipeline_get_rows[VK_NUM_TYPES]; vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES]; vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_mul_f32; vk_pipeline pipeline_mul_f32;
vk_pipeline pipeline_div_f32; vk_pipeline pipeline_div_f32;
vk_pipeline pipeline_add_f32; vk_pipeline pipeline_add_f32;
@ -1059,25 +1057,6 @@ static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& event
); );
} }
static bool ggml_vk_build_shader(ggml_type type) {
switch(type) {
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
return true;
default:
return false;
}
}
static void ggml_vk_load_shaders(vk_device& device) { static void ggml_vk_load_shaders(vk_device& device) {
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
@ -1112,6 +1091,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
@ -1126,6 +1106,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
if (device->fp16) { if (device->fp16) {
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
@ -1226,6 +1207,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@ -1316,6 +1304,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
} else { } else {
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@ -1415,6 +1410,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@ -1505,6 +1507,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
} }
// mul mat vec // mul mat vec
@ -1520,6 +1529,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@ -1533,6 +1543,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@ -1546,6 +1557,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
// dequant shaders // dequant shaders
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@ -1559,6 +1571,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
// get_rows // get_rows
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@ -1568,6 +1581,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@ -1576,6 +1590,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
@ -2087,6 +2102,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@ -2123,6 +2139,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@ -2148,6 +2165,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@ -2181,6 +2199,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@ -2206,6 +2225,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return nullptr; return nullptr;
@ -3431,7 +3451,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
const uint64_t nei0 = ids->ne[0]; const uint64_t nei0 = ids->ne[0];
const uint64_t nei1 = ids->ne[1]; const uint64_t nei1 = ids->ne[1];
GGML_ASSERT(nei0 * nei1 <= 2048); GGML_ASSERT(nei0 * nei1 <= 3072);
const uint32_t nbi1 = ids->nb[1]; const uint32_t nbi1 = ids->nb[1];
const uint32_t nbi2 = ids->nb[2]; const uint32_t nbi2 = ids->nb[2];
@ -3443,8 +3463,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
const uint64_t n_as = ne02; const uint64_t n_as = ne02;
GGML_ASSERT(n_as <= 8);
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
@ -4623,22 +4641,22 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} }
} }
ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it); ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
if (split_k > 1) { if (split_k > 1) {
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
// Resize buffer // Resize buffer
if (ctx->prealloc_split_k != nullptr) { if (ctx->prealloc_split_k != nullptr) {
ggml_vk_destroy_buffer(ctx->prealloc_split_k); ggml_vk_destroy_buffer(ctx->prealloc_split_k);
} }
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
} }
} }
vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne); X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne); Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
@ -4665,12 +4683,12 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} }
} }
ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch); ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
for (size_t i = 0; i < num_it; i++) { for (size_t i = 0; i < num_it; i++) {
ggml_vk_ctx_begin(ctx, subctx); ggml_vk_ctx_begin(ctx->device, subctx);
ggml_vk_matmul( ggml_vk_matmul(
ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
m, n, k, m, n, k,
@ -4689,7 +4707,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
// copy dst to host // copy dst to host
ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne); ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne);
float * d_chk = (float *) malloc(sizeof(float) * d_ne); float * d_chk = (float *) malloc(sizeof(float) * d_ne);
@ -4765,7 +4783,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
if (split_k > 1) { if (split_k > 1) {
float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
std::cerr << "d_buf0: " << std::endl << std::endl; std::cerr << "d_buf0: " << std::endl << std::endl;
ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@ -4785,8 +4803,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
free(d_chk); free(d_chk);
ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue); ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue); ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
ggml_vk_destroy_buffer(d_X); ggml_vk_destroy_buffer(d_X);
ggml_vk_destroy_buffer(d_Y); ggml_vk_destroy_buffer(d_Y);
@ -4834,90 +4852,23 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
} }
} }
static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
// Check transfers are correct
vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
float * x;
float * y;
if (pinned) {
x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
} else {
x = (float *) malloc(sizeof(float) * ne);
y = (float *) malloc(sizeof(float) * ne);
}
for (size_t i = 0; i < ne; i++) {
x[i] = rand() / (float)RAND_MAX;
}
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ggml_vk_ctx_begin(ctx, subctx);
auto begin = std::chrono::high_resolution_clock::now();
ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne);
for (auto& cpy : subctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
subctx->in_memcpys.clear();
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
ctx->device->device.resetFences({ ctx->fence });
auto end = std::chrono::high_resolution_clock::now();
double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
ggml_vk_ctx_begin(ctx, subctx);
begin = std::chrono::high_resolution_clock::now();
ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
ctx->device->device.resetFences({ ctx->fence });
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
subctx->out_memcpys.clear();
end = std::chrono::high_resolution_clock::now();
double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
double avg_err = 0.0;
for (size_t i = 0; i < ne; i++) {
avg_err += std::fabs(x[i] - y[i]);
}
double kb = ne * sizeof(float) / 1024.0;
std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl;
ggml_vk_destroy_buffer(buffer);
if (pinned) {
ggml_vk_host_free(ctx, x);
ggml_vk_host_free(ctx, y);
} else {
free(x);
free(y);
}
}
static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) { static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
} }
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
if (quant == GGML_TYPE_F32) {
memcpy(to, from, sizeof(float) * ne);
return;
}
ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
ggml_to_float_t dequant_fn = tt.to_float;
dequant_fn(from, to, ne);
}
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")"); VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
const size_t x_sz = sizeof(float) * ne; const size_t x_sz = sizeof(float) * ne;
@ -4925,24 +4876,26 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
float * x = (float *) malloc(x_sz); float * x = (float *) malloc(x_sz);
void * qx = malloc(qx_sz); void * qx = malloc(qx_sz);
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
float * x_ref = (float *) malloc(x_sz);
ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16); ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
for (size_t i = 0; i < ne; i++) { for (size_t i = 0; i < ne; i++) {
x[i] = rand() / (float)RAND_MAX; x[i] = rand() / (float)RAND_MAX;
} }
vk_pipeline p = ctx->device->pipeline_dequant[quant]; vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant);
ggml_vk_quantize_data(x, qx, ne, quant); ggml_vk_quantize_data(x, qx, ne, quant);
ggml_vk_dequantize_data(qx, x_ref, ne, quant);
ggml_pipeline_allocate_descriptor_sets(ctx, p, 1); ggml_pipeline_allocate_descriptor_sets(ctx->device, p, 1);
ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ggml_vk_ctx_begin(ctx, subctx); ggml_vk_ctx_begin(ctx->device, subctx);
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
ggml_vk_ctx_end(subctx); ggml_vk_ctx_end(subctx);
@ -4956,13 +4909,13 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16); ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16);
int first_err = -1; int first_err = -1;
double avg_err = 0.0; double avg_err = 0.0;
for (size_t i = 0; i < ne; i++) { for (size_t i = 0; i < ne; i++) {
double error = std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i])); double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i]));
avg_err += error; avg_err += error;
if (first_err < 0 && error > 0.05) { if (first_err < 0 && error > 0.05) {
@ -4982,7 +4935,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
} }
std::cerr << std::endl << "Expected result: " << std::endl << std::endl; std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) { for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
std::cerr << x[i] << ", "; std::cerr << x_ref[i] << ", ";
} }
std::cerr << std::endl; std::cerr << std::endl;
} }
@ -4992,6 +4945,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
free(x); free(x);
free(qx); free(qx);
free(x_ref);
free(x_chk); free(x_chk);
} }
@ -5040,9 +4994,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
float * x = (float *) malloc(x_sz); float * x = (float *) malloc(x_sz);
float * y = (float *) malloc(y_sz); float * y = (float *) malloc(y_sz);
void * qx = malloc(qx_sz); void * qx = malloc(qx_sz);
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
float * d = (float *) malloc(d_sz); float * d = (float *) malloc(d_sz);
float * d_chk = (float *) malloc(d_sz); float * d_chk = (float *) malloc(d_sz);
@ -5057,25 +5011,25 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
y[i] = (i % k == i / k) ? 1.0f : 0.0f; y[i] = (i % k == i / k) ? 1.0f : 0.0f;
} }
ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it); ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
if (split_k > 1) { if (split_k > 1) {
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
// Resize buffer // Resize buffer
if (ctx->prealloc_split_k != nullptr) { if (ctx->prealloc_split_k != nullptr) {
ggml_vk_destroy_buffer(ctx->prealloc_split_k); ggml_vk_destroy_buffer(ctx->prealloc_split_k);
} }
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
} }
} }
ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
ggml_vk_buffer_write(ctx, y_buf, 0, y, y_sz); ggml_vk_buffer_write(y_buf, 0, y, y_sz);
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
for (size_t i = 0; i < num_it; i++) { for (size_t i = 0; i < num_it; i++) {
ggml_vk_ctx_begin(ctx, subctx); ggml_vk_ctx_begin(ctx->device, subctx);
ggml_vk_matmul( ggml_vk_matmul(
ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k), ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
m, n, k, m, n, k,
@ -5094,7 +5048,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
ggml_vk_buffer_read(ctx, d_buf, 0, d, d_sz); ggml_vk_buffer_read(d_buf, 0, d, d_sz);
ggml_init_params iparams = { ggml_init_params iparams = {
/*.mem_size =*/ 1024*1024*1024, /*.mem_size =*/ 1024*1024*1024,
@ -5149,7 +5103,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
if (split_k > 1) { if (split_k > 1) {
float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
std::cerr << "d_buf0: " << std::endl << std::endl; std::cerr << "d_buf0: " << std::endl << std::endl;
ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@ -5302,12 +5256,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
#if defined(GGML_VULKAN_RUN_TESTS) #if defined(GGML_VULKAN_RUN_TESTS)
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
ggml_vk_test_transfer(ctx, 8192 * 1000, false);
ggml_vk_test_transfer(ctx, 8192 * 1000, true);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
@ -5319,85 +5270,90 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL);
ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2); ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0); ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1); ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2); ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0); // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1); // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2); // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K); // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL);
std::cerr << std::endl; std::cerr << std::endl;
@ -5429,9 +5385,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
std::cerr << std::endl; std::cerr << std::endl;
} }
@ -6263,6 +6219,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
break; break;
default: default:
return false; return false;
@ -6291,6 +6248,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return true; return true;
default: default:
return false; return false;

View file

@ -21015,7 +21015,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
gguf_tensor_info_sanitize(info); gguf_tensor_info_sanitize(info);
// make sure there is no duplicated tensor names // make sure there is no duplicated tensor names
for (uint64_t j = 0; j < i; ++j) { for (uint64_t j = 0; j < i && ok; ++j) {
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) { if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data); fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
ok = false; ok = false;
@ -21096,6 +21096,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
}; };
*params.ctx = ggml_init(pdata); *params.ctx = ggml_init(pdata);
if (*params.ctx == NULL) {
fprintf(stderr, "%s: failed to initialize context\n", __func__);
fclose(file);
gguf_free(ctx);
return NULL;
}
struct ggml_context * ctx_data = *params.ctx; struct ggml_context * ctx_data = *params.ctx;
@ -22005,6 +22011,14 @@ int ggml_cpu_has_cann(void) {
#endif #endif
} }
int ggml_cpu_has_llamafile(void) {
#if defined(GGML_USE_LLAMAFILE)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_gpublas(void) { int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl(); return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
} }

View file

@ -58,3 +58,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d; return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
} }
#endif #endif
#if defined(DATA_A_IQ4_NL)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
}
#endif

View file

@ -0,0 +1,30 @@
#version 450
#include "dequant_head.comp"
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() {
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
const uint tid = gl_LocalInvocationID.x % 64;
const uint il = tid/32;
const uint ir = tid%32;
const uint ib = 32*i + ir;
if (ib >= p.nel / 32) {
return;
}
const uint q_idx = 8*il;
const uint b_idx = 1024*i + 32*ir + q_idx;
const float d = float(data_a[ib].d);
[[unroll]] for (uint l = 0; l < 8; ++l) {
data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
}
}

View file

@ -18,15 +18,13 @@ void main() {
return; return;
} }
const uint b_idx = 1024*i + 32*ir + 8*il; const uint q_idx = 8*il;
const uint b_idx = 1024*i + 32*ir + q_idx;
const float d = float(data_a[ib].d); const float d = float(data_a[ib].d);
const float dm = -8.0f * d;
const uint q_idx = 8*il;
[[unroll]] for (uint l = 0; l < 8; ++l) { [[unroll]] for (uint l = 0; l < 8; ++l) {
data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm); data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + dm); data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f));
} }
} }

View file

@ -71,7 +71,7 @@ shared FLOAT_TYPE buf_a[BM * (BK+1)];
shared FLOAT_TYPE buf_b[BN * (BK+1)]; shared FLOAT_TYPE buf_b[BN * (BK+1)];
#ifdef MUL_MAT_ID #ifdef MUL_MAT_ID
shared u16vec2 row_ids[2048]; shared u16vec2 row_ids[3072];
#endif #endif
void main() { void main() {
@ -380,6 +380,19 @@ void main() {
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32)); buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
#elif defined(DATA_A_IQ4_NL)
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a;
const uint ib = idx / 16;
const uint iqs = idx & 0xF;
const float d = float(data_a[ib].d);
const uint vui = uint(data_a[ib].qs[iqs]);
const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
#endif #endif
} }
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) { [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {

View file

@ -177,3 +177,24 @@ struct block_q6_K
#define A_TYPE block_q6_K #define A_TYPE block_q6_K
#endif #endif
// IQuants
#if defined(DATA_A_IQ4_NL)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 32
#define QUANT_R 2
struct block_iq4_nl
{
float16_t d;
uint8_t qs[QUANT_K/2];
};
#define A_TYPE block_iq4_nl
const int8_t kvalues_iq4nl[16] = {
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
};
#endif

View file

@ -52,7 +52,8 @@ const std::vector<std::string> type_names = {
"q3_k", "q3_k",
"q4_k", "q4_k",
"q5_k", "q5_k",
"q6_k" "q6_k",
"iq4_nl"
}; };
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) { void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {

View file

@ -54,6 +54,7 @@ class Metadata:
model_card = Metadata.load_model_card(model_path) model_card = Metadata.load_model_card(model_path)
hf_params = Metadata.load_hf_parameters(model_path) hf_params = Metadata.load_hf_parameters(model_path)
# TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
# heuristics # heuristics
metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params) metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
@ -177,6 +178,12 @@ class Metadata:
org_component = None org_component = None
name_parts: list[str] = model_full_name_component.split('-') name_parts: list[str] = model_full_name_component.split('-')
# Remove empty parts
for i in reversed(range(len(name_parts))):
if len(name_parts[i]) == 0:
del name_parts[i]
name_types: list[ name_types: list[
set[Literal["basename", "size_label", "finetune", "version", "type"]] set[Literal["basename", "size_label", "finetune", "version", "type"]]
] = [set() for _ in name_parts] ] = [set() for _ in name_parts]
@ -223,9 +230,19 @@ class Metadata:
name_parts[i] = part name_parts[i] = part
# Some easy to recognize finetune names # Some easy to recognize finetune names
elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE): elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
if total_params < 0 and part.lower() == "lora":
# ignore redundant "lora" in the finetune part when the output is a lora adapter
name_types[i].add("type")
else:
name_types[i].add("finetune") name_types[i].add("finetune")
if part.lower() == "lora":
name_parts[i] = "LoRA" # Ignore word-based size labels when there is at least a number-based one present
# TODO: should word-based size labels always be removed instead?
if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
for n, t in zip(name_parts, name_types):
if "size_label" in t:
if all(c.isalpha() for c in n):
t.remove("size_label")
at_start = True at_start = True
# Find the basename through the annotated name # Find the basename through the annotated name
@ -240,18 +257,18 @@ class Metadata:
# Remove the basename annotation from trailing version # Remove the basename annotation from trailing version
for part, t in zip(reversed(name_parts), reversed(name_types)): for part, t in zip(reversed(name_parts), reversed(name_types)):
if "basename" in t: if "basename" in t and len(t) > 1:
if len(t) > 1:
t.remove("basename") t.remove("basename")
else: else:
break break
basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
size_label = "-".join(s for s, t in zip(name_parts, name_types) if "size_label" in t) or None # Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys)
size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
# TODO: should the basename version always be excluded? # TODO: should the basename version always be excluded?
# TODO: should multiple versions be joined together? # NOTE: multiple finetune versions are joined together
version = ([v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t] or [None])[-1] version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None
if size_label is None and finetune is None and version is None: if size_label is None and finetune is None and version is None:
# Too ambiguous, output nothing # Too ambiguous, output nothing

View file

@ -50,15 +50,15 @@ def naming_convention(model_name: str | None, base_name: str | None, finetune_st
# Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
if base_name is not None: if base_name is not None:
name = base_name.strip().title().replace(' ', '-').replace('/', '-') name = base_name.strip().replace(' ', '-').replace('/', '-')
elif model_name is not None: elif model_name is not None:
name = model_name.strip().title().replace(' ', '-').replace('/', '-') name = model_name.strip().replace(' ', '-').replace('/', '-')
else: else:
name = "ggml-model" name = "ggml-model"
parameters = f"-{size_label}" if size_label is not None else "" parameters = f"-{size_label}" if size_label is not None else ""
finetune = f"-{finetune_string.strip().title().replace(' ', '-')}" if finetune_string is not None else "" finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""
version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else "" version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""

View file

@ -4,6 +4,7 @@ from __future__ import annotations
import logging import logging
import argparse import argparse
import os import os
import re
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -244,26 +245,58 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
else: else:
pretty_type = str(field.types[-1].name) pretty_type = str(field.types[-1].name)
def escape_markdown_inline_code(value_string):
# Find the longest contiguous sequence of backticks in the string then
# wrap string with appropriate number of backticks required to escape it
max_backticks = max((len(match.group(0)) for match in re.finditer(r'`+', value_string)), default=0)
inline_code_marker = '`' * (max_backticks + 1)
# If the string starts or ends with a backtick, add a space at the beginning and end
if value_string.startswith('`') or value_string.endswith('`'):
value_string = f" {value_string} "
return f"{inline_code_marker}{value_string}{inline_code_marker}"
total_elements = len(field.data) total_elements = len(field.data)
value = "" value = ""
if len(field.types) == 1: if len(field.types) == 1:
curr_type = field.types[0] curr_type = field.types[0]
if curr_type == GGUFValueType.STRING: if curr_type == GGUFValueType.STRING:
value = repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60]) truncate_length = 60
value_string = str(bytes(field.parts[-1]), encoding='utf-8')
if len(value_string) > truncate_length:
head = escape_markdown_inline_code(value_string[:truncate_length // 2])
tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
value = "{head}...{tail}".format(head=head, tail=tail)
else:
value = escape_markdown_inline_code(value_string)
elif curr_type in reader.gguf_scalar_to_np: elif curr_type in reader.gguf_scalar_to_np:
value = str(field.parts[-1][0]) value = str(field.parts[-1][0])
else: else:
if field.types[0] == GGUFValueType.ARRAY: if field.types[0] == GGUFValueType.ARRAY:
curr_type = field.types[1] curr_type = field.types[1]
array_elements = []
if curr_type == GGUFValueType.STRING: if curr_type == GGUFValueType.STRING:
render_element = min(5, total_elements) render_element = min(5, total_elements)
for element_pos in range(render_element): for element_pos in range(render_element):
value += repr(str(bytes(field.parts[-1 - element_pos]), encoding='utf-8')[:5]) + (", " if total_elements > 1 else "") truncate_length = 30
value_string = str(bytes(field.parts[-1 - (total_elements - element_pos - 1) * 2]), encoding='utf-8')
if len(value_string) > truncate_length:
head = escape_markdown_inline_code(value_string[:truncate_length // 2])
tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
value = "{head}...{tail}".format(head=head, tail=tail)
else:
value = escape_markdown_inline_code(value_string)
array_elements.append(value)
elif curr_type in reader.gguf_scalar_to_np: elif curr_type in reader.gguf_scalar_to_np:
render_element = min(7, total_elements) render_element = min(7, total_elements)
for element_pos in range(render_element): for element_pos in range(render_element):
value += str(field.parts[-1 - element_pos][0]) + (", " if total_elements > 1 else "") array_elements.append(str(field.parts[-1 - (total_elements - element_pos - 1)][0]))
value = f'[ {value}{" ..." if total_elements > 1 else ""} ]'
value = f'[ {", ".join(array_elements).strip()}{", ..." if total_elements > len(array_elements) else ""} ]'
kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value}) kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value})
kv_dump_table_header_map = [ kv_dump_table_header_map = [

View file

@ -54,7 +54,7 @@ class TestMetadataMethod(unittest.TestCase):
self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"), self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B')) ('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
# Can't detect all non standard form in a heuristically safe way... best to err in caution and output nothing... # Non standard naming
self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"), self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B')) ('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
@ -71,7 +71,7 @@ class TestMetadataMethod(unittest.TestCase):
self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3), self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K')) ('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
# None standard and not easy to disambiguate # Non standard and not easy to disambiguate
self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"), self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None)) ('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
@ -123,6 +123,51 @@ class TestMetadataMethod(unittest.TestCase):
self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"), self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B')) ('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
# Ignore full-text size labels when there are number-based ones, and deduplicate size labels
self.assertEqual(gguf.Metadata.get_model_id_components("MaziyarPanahi/GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1"),
('GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1', 'MaziyarPanahi', 'GreenNode-mini', 'multilingual-v1olet-Mistral-Instruct', 'v0.1', '7B'))
# Instruct in a name without a size label
self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/Mistral-Nemo-Instruct-2407"),
('Mistral-Nemo-Instruct-2407', 'mistralai', 'Mistral-Nemo', 'Instruct', '2407', None))
# Non-obvious splitting relying on 'chat' keyword
self.assertEqual(gguf.Metadata.get_model_id_components("deepseek-ai/DeepSeek-V2-Chat-0628"),
('DeepSeek-V2-Chat-0628', 'deepseek-ai', 'DeepSeek-V2', 'Chat', '0628', None))
# Multiple versions
self.assertEqual(gguf.Metadata.get_model_id_components("OpenGVLab/Mini-InternVL-Chat-2B-V1-5"),
('Mini-InternVL-Chat-2B-V1-5', 'OpenGVLab', 'Mini-InternVL', 'Chat', 'V1-5', '2B'))
# TODO: DPO in the name
self.assertEqual(gguf.Metadata.get_model_id_components("jondurbin/bagel-dpo-2.8b-v0.2"),
('bagel-dpo-2.8b-v0.2', 'jondurbin', 'bagel-dpo', None, 'v0.2', '2.8B'))
# DPO in name, but can't be used for the finetune to keep 'LLaMA-3' in the basename
self.assertEqual(gguf.Metadata.get_model_id_components("voxmenthe/SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized"),
('SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized', 'voxmenthe', 'SFR-Iterative-DPO-LLaMA-3', 'R-unquantized', None, '8B'))
# Too ambiguous
# TODO: should "base" be a 'finetune' or 'size_label'?
# (in this case it should be a size label, but other models use it to signal that they are not finetuned)
self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Florence-2-base"),
('Florence-2-base', 'microsoft', None, None, None, None))
## Invalid cases ##
# Start with a dash and has dashes in rows
self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/-Mistral--Nemo-Base-2407-"),
('-Mistral--Nemo-Base-2407-', 'mistralai', 'Mistral-Nemo-Base', None, '2407', None))
## LoRA ##
self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"),
('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B'))
# Negative size --> output is a LoRA adaper --> prune "LoRA" out of the name to avoid redundancy with the suffix
self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234),
('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B'))
def test_apply_metadata_heuristic_from_model_card(self): def test_apply_metadata_heuristic_from_model_card(self):
model_card = { model_card = {
'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'], 'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
@ -134,7 +179,7 @@ class TestMetadataMethod(unittest.TestCase):
} }
got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
expect = gguf.Metadata() expect = gguf.Metadata()
expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': 'v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}] expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'] expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
expect.languages=['en'] expect.languages=['en']
expect.datasets=['teknium/OpenHermes-2.5'] expect.datasets=['teknium/OpenHermes-2.5']

View file

@ -92,6 +92,9 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18, LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19, LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
}; };
// note: these values should be synchronized with ggml_rope // note: these values should be synchronized with ggml_rope
@ -526,12 +529,16 @@ extern "C" {
struct llama_lora_adapter * adapter, struct llama_lora_adapter * adapter,
float scale); float scale);
// Remove a LoRA adapter from given context // Remove a specific LoRA adapter from given context
// Return -1 if the adapter is not present in the context // Return -1 if the adapter is not present in the context
LLAMA_API int32_t llama_lora_adapter_remove( LLAMA_API int32_t llama_lora_adapter_remove(
struct llama_context * ctx, struct llama_context * ctx,
struct llama_lora_adapter * adapter); struct llama_lora_adapter * adapter);
// Remove all LoRA adapters from given context
LLAMA_API void llama_lora_adapter_clear(
struct llama_context * ctx);
// Manually free a LoRA adapter // Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted // Note: loaded adapters will be free when the associated model is deleted
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
@ -962,6 +969,10 @@ extern "C" {
bool remove_special, bool remove_special,
bool unparse_special); bool unparse_special);
//
// Chat templates
//
/// Apply chat template. Inspired by hf apply_chat_template() on python. /// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
/// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
@ -1000,6 +1011,23 @@ extern "C" {
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar); LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
/// @details Apply constraints from grammar
LLAMA_API void llama_grammar_sample(
const struct llama_grammar * grammar,
const struct llama_context * ctx,
llama_token_data_array * candidates);
LLAMA_API DEPRECATED(void llama_sample_grammar(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar),
"use llama_grammar_sample instead");
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(
struct llama_grammar * grammar,
struct llama_context * ctx,
llama_token token);
// //
// Sampling functions // Sampling functions
// //
@ -1081,12 +1109,6 @@ extern "C" {
llama_token_data_array * candidates, llama_token_data_array * candidates,
float temp); float temp);
/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@ -1124,12 +1146,6 @@ extern "C" {
struct llama_context * ctx, struct llama_context * ctx,
llama_token_data_array * candidates); llama_token_data_array * candidates);
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(
struct llama_context * ctx,
struct llama_grammar * grammar,
llama_token token);
// //
// Model split // Model split
// //
@ -1172,34 +1188,41 @@ extern "C" {
struct ggml_tensor; struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
struct llama_partial_utf8 { struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted) uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence int n_remain; // num bytes remaining; -1 indicates invalid sequence
}; };
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate { struct llama_grammar_candidate {
size_t index; size_t index;
const uint32_t * code_points; const uint32_t * code_points;
llama_partial_utf8 partial_utf8; llama_partial_utf8 partial_utf8;
}; };
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map( using llama_grammar_rule = std::vector< llama_grammar_element>;
struct llama_context * ctx using llama_grammar_stack = std::vector<const llama_grammar_element *>;
);
using llama_grammar_rules = std::vector<llama_grammar_rule>;
using llama_grammar_stacks = std::vector<llama_grammar_stack>;
using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
void llama_grammar_accept( void llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules, const llama_grammar_rules & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks, const llama_grammar_stacks & stacks,
const uint32_t chr, const uint32_t chr,
std::vector<std::vector<const llama_grammar_element *>> & new_stacks); llama_grammar_stacks & new_stacks);
std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
const llama_grammar_rules & rules,
const llama_grammar_stack & stack,
const llama_grammar_candidates & candidates);
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8( std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src, const std::string & src,

Binary file not shown.

Binary file not shown.

View file

@ -14,6 +14,9 @@ endif()
add_library(llama add_library(llama
../include/llama.h ../include/llama.h
llama.cpp llama.cpp
llama-vocab.cpp
llama-grammar.cpp
llama-sampling.cpp
unicode.h unicode.h
unicode.cpp unicode.cpp
unicode-data.cpp unicode-data.cpp

539
src/llama-grammar.cpp Normal file
View file

@ -0,0 +1,539 @@
#include "llama-grammar.h"
#include "llama-vocab.h"
#include "llama-sampling.h"
#include <algorithm>
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,
llama_partial_utf8 partial_start) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
const char * pos = src.c_str();
std::vector<uint32_t> code_points;
// common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
code_points.reserve(src.size() + 1);
uint32_t value = partial_start.value;
int n_remain = partial_start.n_remain;
// continue previous decode, if applicable
while (*pos != 0 && n_remain > 0) {
uint8_t next_byte = static_cast<uint8_t>(*pos);
if ((next_byte >> 6) != 2) {
// invalid sequence, abort
code_points.push_back(0);
return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
}
value = (value << 6) + (next_byte & 0x3F);
++pos;
--n_remain;
}
if (partial_start.n_remain > 0 && n_remain == 0) {
code_points.push_back(value);
}
// decode any subsequent utf-8 sequences, which may end in an incomplete one
while (*pos != 0) {
uint8_t first_byte = static_cast<uint8_t>(*pos);
uint8_t highbits = first_byte >> 4;
n_remain = lookup[highbits] - 1;
if (n_remain < 0) {
// invalid sequence, abort
code_points.clear();
code_points.push_back(0);
return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
}
uint8_t mask = (1 << (7 - n_remain)) - 1;
value = first_byte & mask;
++pos;
while (*pos != 0 && n_remain > 0) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
++pos;
--n_remain;
}
if (n_remain == 0) {
code_points.push_back(value);
}
}
code_points.push_back(0);
return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
}
const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
return grammar->rules;
}
llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
return grammar->stacks;
}
// returns true iff pos points to the end of one of the definitions of a rule
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
switch (pos->type) {
case LLAMA_GRETYPE_END: return true; // NOLINT
case LLAMA_GRETYPE_ALT: return true; // NOLINT
default: return false;
}
}
// returns true iff chr satisfies the char range at pos (regular or inverse range)
// asserts that pos is pointing to a char range element
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
const llama_grammar_element * pos,
const uint32_t chr) {
bool found = false;
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
do {
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
// inclusive range, e.g. [a-z]
found = found || (pos->value <= chr && chr <= pos[1].value);
pos += 2;
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
// Any character matches "."
found = true;
pos += 1;
} else {
// exact char match, e.g. [a] or "a"
found = found || pos->value == chr;
pos += 1;
}
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
return std::make_pair(found == is_positive_char, pos);
}
// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
// range at pos (regular or inverse range)
// asserts that pos is pointing to a char range element
static bool llama_grammar_match_partial_char(
const llama_grammar_element * pos,
const llama_partial_utf8 partial_utf8) {
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
uint32_t partial_value = partial_utf8.value;
int n_remain = partial_utf8.n_remain;
// invalid sequence or 7-bit char split across 2 bytes (overlong)
if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
return false;
}
// range of possible code points this partial UTF-8 sequence could complete to
uint32_t low = partial_value << (n_remain * 6);
uint32_t high = low | ((1 << (n_remain * 6)) - 1);
if (low == 0) {
if (n_remain == 2) {
low = 1 << 11;
} else if (n_remain == 3) {
low = 1 << 16;
}
}
do {
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
// inclusive range, e.g. [a-z]
if (pos->value <= high && low <= pos[1].value) {
return is_positive_char;
}
pos += 2;
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
// Any character matches "."
return true;
} else {
// exact char match, e.g. [a] or "a"
if (low <= pos->value && pos->value <= high) {
return is_positive_char;
}
pos += 1;
}
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
return !is_positive_char;
}
// transforms a grammar pushdown stack into N possible stacks, all ending
// at a character range (terminal element)
static void llama_grammar_advance_stack(
const llama_grammar_rules & rules,
const llama_grammar_stack & stack,
llama_grammar_stacks & new_stacks) {
if (stack.empty()) {
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
new_stacks.emplace_back(stack);
}
return;
}
const llama_grammar_element * pos = stack.back();
switch (pos->type) {
case LLAMA_GRETYPE_RULE_REF: {
const size_t rule_id = static_cast<size_t>(pos->value);
const llama_grammar_element * subpos = rules[rule_id].data();
do {
// init new stack without the top (pos)
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
// if this rule ref is followed by another element, add that to stack
new_stack.push_back(pos + 1);
}
if (!llama_grammar_is_end_of_sequence(subpos)) {
// if alternate is nonempty, add to stack
new_stack.push_back(subpos);
}
llama_grammar_advance_stack(rules, new_stack, new_stacks);
while (!llama_grammar_is_end_of_sequence(subpos)) {
// scan to end of alternate def
subpos++;
}
if (subpos->type == LLAMA_GRETYPE_ALT) {
// there's another alternate def of this rule to process
subpos++;
} else {
break;
}
} while (true);
break;
}
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_ANY:
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
// only add the stack if it's not a duplicate of one we already have
new_stacks.emplace_back(stack);
}
break;
default:
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
// those
GGML_ASSERT(false);
}
}
// takes a set of possible pushdown stacks on a grammar, which are required to
// be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those
// positions
void llama_grammar_accept(
const llama_grammar_rules & rules,
const llama_grammar_stacks & stacks,
const uint32_t chr,
llama_grammar_stacks & new_stacks) {
new_stacks.clear();
for (const auto & stack : stacks) {
if (stack.empty()) {
continue;
}
auto match = llama_grammar_match_char(stack.back(), chr);
if (match.first) {
const llama_grammar_element * pos = match.second;
// update top of stack to next element, if any
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(pos)) {
new_stack.push_back(pos);
}
llama_grammar_advance_stack(rules, new_stack, new_stacks);
}
}
}
static llama_grammar_candidates llama_grammar_reject_candidates(
const llama_grammar_rules & rules,
const llama_grammar_stacks & stacks,
const llama_grammar_candidates & candidates) {
GGML_ASSERT(!stacks.empty()); // REVIEW
if (candidates.empty()) {
return {};
}
auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
for (size_t i = 1, size = stacks.size(); i < size; ++i) {
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
}
return rejects;
}
llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
const llama_grammar_rules & rules,
const llama_grammar_stack & stack,
const llama_grammar_candidates & candidates) {
llama_grammar_candidates rejects;
rejects.reserve(candidates.size());
if (stack.empty()) {
for (const auto & tok : candidates) {
if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
rejects.push_back(tok);
}
}
return rejects;
}
const llama_grammar_element * stack_pos = stack.back();
llama_grammar_candidates next_candidates;
next_candidates.reserve(candidates.size());
for (const auto & tok : candidates) {
if (*tok.code_points == 0) {
// reached end of full codepoints in token, reject iff it ended in a partial sequence
// that cannot satisfy this position in grammar
if (tok.partial_utf8.n_remain != 0 &&
!llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
rejects.push_back(tok);
}
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
} else {
rejects.push_back(tok);
}
}
const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
// update top of stack to next element, if any
llama_grammar_stack stack_after(stack.begin(), stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
stack_after.push_back(stack_pos_after);
}
llama_grammar_stacks next_stacks;
llama_grammar_advance_stack(rules, stack_after, next_stacks);
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
for (const auto & tok : next_rejects) {
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
}
return rejects;
}
static bool llama_grammar_detect_left_recursion(
const llama_grammar_rules & rules,
size_t rule_index,
std::vector<bool> * rules_visited,
std::vector<bool> * rules_in_progress,
std::vector<bool> * rules_may_be_empty) {
if ((*rules_in_progress)[rule_index]) {
return true;
}
(*rules_in_progress)[rule_index] = true;
const llama_grammar_rule & rule = rules[rule_index];
// First check if the rule might produce the empty string. This could be done combined with the second
// step but it's more readable as two steps.
bool at_rule_start = true;
for (size_t i = 0; i < rule.size(); i++) {
if (llama_grammar_is_end_of_sequence(&rule[i])) {
if (at_rule_start) {
(*rules_may_be_empty)[rule_index] = true;
break;
}
at_rule_start = true;
} else {
at_rule_start = false;
}
}
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
// be empty)
bool recurse_into_nonterminal = true;
for (size_t i = 0; i < rule.size(); i++) {
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
return true;
}
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
recurse_into_nonterminal = false;
}
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
recurse_into_nonterminal = true;
} else {
recurse_into_nonterminal = false;
}
}
(*rules_in_progress)[rule_index] = false;
(*rules_visited)[rule_index] = true;
return false;
}
//
// grammar - external
//
struct llama_grammar * llama_grammar_init_impl(
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index) {
const llama_grammar_element * pos;
// copy rule definitions into vectors
llama_grammar_rules vec_rules(n_rules);
for (size_t i = 0; i < n_rules; i++) {
for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
vec_rules[i].push_back(*pos);
}
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
}
// Check for left recursion
std::vector<bool> rules_visited(n_rules);
std::vector<bool> rules_in_progress(n_rules);
std::vector<bool> rules_may_be_empty(n_rules);
for (size_t i = 0; i < n_rules; i++) {
if (rules_visited[i]) {
continue;
}
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
return nullptr;
}
}
// loop over alternates of start rule to build initial stacks
llama_grammar_stacks stacks;
pos = vec_rules[start_rule_index].data();
do {
llama_grammar_stack stack;
if (!llama_grammar_is_end_of_sequence(pos)) {
// if alternate is nonempty, add to stack
stack.push_back(pos);
}
llama_grammar_advance_stack(vec_rules, stack, stacks);
while (!llama_grammar_is_end_of_sequence(pos)) {
// scan to end of alternate def
pos++;
}
if (pos->type == LLAMA_GRETYPE_ALT) {
// there's another alternate def of this rule to process
pos++;
} else {
break;
}
} while (true);
// Important: vec_rules has to be moved here, not copied, because stacks contains
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
}
void llama_grammar_free_impl(struct llama_grammar * grammar) {
delete grammar;
}
struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar) {
llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
// redirect elements in stacks to point to new rules
for (size_t is = 0; is < result->stacks.size(); is++) {
for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
result->stacks[is][ie] = &result->rules[ir0][ir1];
}
}
}
}
}
return result;
}
void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token_data_array * candidates) {
GGML_ASSERT(grammar);
GGML_ASSERT(vocab);
int64_t t_start_sample_us = ggml_time_us();
bool allow_eog = false;
for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
allow_eog = true;
break;
}
}
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
candidates_decoded.reserve(candidates->size);
llama_grammar_candidates candidates_grammar;
candidates_grammar.reserve(candidates->size);
for (size_t i = 0; i < candidates->size; ++i) {
const llama_token id = candidates->data[i].id;
const std::string & piece = vocab->cache_token_to_piece.at(id);
if (llama_token_is_eog_impl(*vocab, id)) {
if (!allow_eog) {
candidates->data[i].logit = -INFINITY;
}
} else if (piece.empty() || piece[0] == 0) {
candidates->data[i].logit = -INFINITY;
} else {
candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
}
}
const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
for (const auto & reject : rejects) {
candidates->data[reject.index].logit = -INFINITY;
}
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token token) {
const int64_t t_start_sample_us = ggml_time_us();
if (llama_token_is_eog_impl(*vocab, token)) {
for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
return;
}
}
GGML_ASSERT(false);
}
const std::string & piece = vocab->cache_token_to_piece.at(token);
// Note terminating 0 in decoded string
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
const auto & code_points = decoded.first;
llama_grammar_stacks tmp_new_stacks;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
grammar->stacks = tmp_new_stacks;
}
grammar->partial_utf8 = decoded.second;
GGML_ASSERT(!grammar->stacks.empty());
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}

39
src/llama-grammar.h Normal file
View file

@ -0,0 +1,39 @@
#pragma once
#include "llama-impl.h"
struct llama_vocab;
struct llama_sampling;
struct llama_grammar {
const llama_grammar_rules rules;
llama_grammar_stacks stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
//
// internal API
//
struct llama_grammar * llama_grammar_init_impl(
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
void llama_grammar_free_impl(struct llama_grammar * grammar);
struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar);
void llama_grammar_sample_impl(
const struct llama_grammar * grammar,
const struct llama_vocab * vocab,
const struct llama_sampling * smpl,
llama_token_data_array * candidates);
void llama_grammar_accept_token_impl(
struct llama_grammar * grammar,
const struct llama_vocab * vocab,
const struct llama_sampling * smpl,
llama_token token);

26
src/llama-impl.h Normal file
View file

@ -0,0 +1,26 @@
#pragma once
#define LLAMA_API_INTERNAL
#include "llama.h"
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
//
// logging
//
LLAMA_ATTRIBUTE_FORMAT(2, 3)
void llama_log_internal (ggml_log_level level, const char * format, ...);
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

635
src/llama-sampling.cpp Normal file
View file

@ -0,0 +1,635 @@
#include "llama-sampling.h"
#include <algorithm>
#include <cstring>
#include <ctime>
#include <cfloat>
#include <numeric>
#include <unordered_map>
static void llama_log_softmax(float * array, size_t size) {
float max_l = *std::max_element(array, array + size);
float sum = 0.f;
for (size_t i = 0; i < size; ++i) {
float p = expf(array[i] - max_l);
sum += p;
array[i] = p;
}
for (size_t i = 0; i < size; ++i) {
array[i] = logf(array[i] / sum);
}
}
void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
seed = time(NULL);
}
smpl->rng.seed(seed);
}
void llama_sample_softmax_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
GGML_ASSERT(candidates->size > 0);
const int64_t t_start_sample_us = ggml_time_us();
// Sort the logits in descending order
if (!candidates->sorted) {
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
});
candidates->sorted = true;
}
float max_l = candidates->data[0].logit;
float cum_sum = 0.0f;
for (size_t i = 0; i < candidates->size; ++i) {
float p = expf(candidates->data[i].logit - max_l);
candidates->data[i].p = p;
cum_sum += p;
}
for (size_t i = 0; i < candidates->size; ++i) {
candidates->data[i].p /= cum_sum;
}
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
// if (k >= (int32_t)candidates->size) {
// return;
// }
const int64_t t_start_sample_us = ggml_time_us();
if (k <= 0) {
k = candidates->size;
}
k = std::max(k, (int) min_keep);
k = std::min(k, (int) candidates->size);
// Sort scores in descending order
if (!candidates->sorted) {
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
if (k <= 128) {
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
} else {
constexpr int nbuckets = 128;
constexpr float bucket_low = -10.0f;
constexpr float bucket_high = 10.0f;
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
constexpr float bucker_inter = -bucket_low * bucket_scale;
std::vector<int> bucket_idx(candidates->size);
std::vector<int> histo(nbuckets, 0);
for (int i = 0; i < (int)candidates->size; ++i) {
const float val = candidates->data[i].logit;
int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets-1, ib));
bucket_idx[i] = ib;
++histo[ib];
}
int nhave = 0;
int ib = nbuckets - 1;
for ( ; ib >= 0; --ib) {
nhave += histo[ib];
if (nhave >= k) break;
}
std::vector<llama_token_data> tmp_tokens(nhave);
auto ptr = tmp_tokens.data();
std::vector<llama_token_data*> bucket_ptrs;
bucket_ptrs.reserve(nbuckets - ib);
for (int j = nbuckets - 1; j >= ib; --j) {
bucket_ptrs.push_back(ptr);
ptr += histo[j];
}
for (int i = 0; i < (int)candidates->size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
*bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
}
}
ptr = tmp_tokens.data();
int ndone = 0;
for (int j = nbuckets-1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
}
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
}
candidates->sorted = true;
}
candidates->size = k;
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_top_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
if (p >= 1.0f) {
return;
}
llama_sample_softmax_impl(smpl, candidates);
const int64_t t_start_sample_us = ggml_time_us();
// Compute the cumulative probabilities
float cum_sum = 0.0f;
size_t last_idx = candidates->size;
for (size_t i = 0; i < candidates->size; ++i) {
cum_sum += candidates->data[i].p;
// Check if the running sum is at least p or if we have kept at least min_keep tokens
// we set the last index to i+1 to indicate that the current iterate should be included in the set
if (cum_sum >= p && i + 1 >= min_keep) {
last_idx = i + 1;
break;
}
}
// Resize the output vector to keep only the top-p tokens
candidates->size = last_idx;
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_min_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
if (p <= 0.0f || !candidates->size) {
return;
}
const int64_t t_start_sample_us = ggml_time_us();
bool min_p_applied = false;
// if the candidates aren't sorted, try the unsorted implementation first
if (!candidates->sorted) {
std::vector<llama_token_data> filtered_tokens;
float max_logit = -FLT_MAX;
for (size_t i = 0; i < candidates->size; ++i) {
max_logit = std::max(max_logit, candidates->data[i].logit);
}
const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
for (size_t i = 0; i < candidates->size; ++i) {
if (candidates->data[i].logit >= min_logit) {
filtered_tokens.push_back(candidates->data[i]);
}
}
// if we have enough values the operation was a success
if (filtered_tokens.size() >= min_keep) {
memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
candidates->size = filtered_tokens.size();
min_p_applied = true;
}
}
// if the candidates are sorted or the unsorted implementation failed, use this implementation
if (!min_p_applied) {
// Sort the logits in descending order
if (!candidates->sorted) {
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
});
candidates->sorted = true;
}
const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
size_t i = 1; // first token always matches
for (; i < candidates->size; ++i) {
if (candidates->data[i].logit < min_logit && i >= min_keep) {
break; // prob too small
}
}
// Resize the output vector to keep only the matching tokens
candidates->size = i;
}
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) {
if (z >= 1.0f || candidates->size <= 2) {
return;
}
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
const int64_t t_start_sample_us = ggml_time_us();
// Compute the first and second derivatives
std::vector<float> first_derivatives(candidates->size - 1);
std::vector<float> second_derivatives(candidates->size - 2);
for (size_t i = 0; i < first_derivatives.size(); ++i) {
first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
}
for (size_t i = 0; i < second_derivatives.size(); ++i) {
second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
}
// Calculate absolute value of second derivatives
for (size_t i = 0; i < second_derivatives.size(); ++i) {
second_derivatives[i] = std::abs(second_derivatives[i]);
}
// Normalize the second derivatives
{
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
if (second_derivatives_sum > 1e-6f) {
for (float & value : second_derivatives) {
value /= second_derivatives_sum;
}
} else {
for (float & value : second_derivatives) {
value = 1.0f / second_derivatives.size();
}
}
}
float cum_sum = 0.0f;
size_t last_idx = candidates->size;
for (size_t i = 0; i < second_derivatives.size(); ++i) {
cum_sum += second_derivatives[i];
// Check if the running sum is greater than z or if we have kept at least min_keep tokens
if (cum_sum > z && i >= min_keep) {
last_idx = i;
break;
}
}
// Resize the output vector to keep only the tokens above the tail location
candidates->size = last_idx;
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
// Reference implementation:
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
if (p >= 1.0f) {
return;
}
// Compute the softmax of logits and calculate entropy
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
const int64_t t_start_sample_us = ggml_time_us();
float entropy = 0.0f;
for (size_t i = 0; i < candidates->size; ++i) {
entropy += -candidates->data[i].p * logf(candidates->data[i].p);
}
// Compute the absolute difference between negative log probability and entropy for each candidate
std::vector<float> shifted_scores;
for (size_t i = 0; i < candidates->size; ++i) {
float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
shifted_scores.push_back(shifted_score);
}
// Sort tokens based on the shifted_scores and their corresponding indices
std::vector<size_t> indices(candidates->size);
std::iota(indices.begin(), indices.end(), 0);
std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
return shifted_scores[a] < shifted_scores[b];
});
// Compute the cumulative probabilities
float cum_sum = 0.0f;
size_t last_idx = indices.size();
for (size_t i = 0; i < indices.size(); ++i) {
size_t idx = indices[i];
cum_sum += candidates->data[idx].p;
// Check if the running sum is greater than typical or if we have kept at least min_keep tokens
if (cum_sum > p && i >= min_keep - 1) {
last_idx = i + 1;
break;
}
}
// Resize the output vector to keep only the locally typical tokens
std::vector<llama_token_data> new_candidates;
for (size_t i = 0; i < last_idx; ++i) {
size_t idx = indices[i];
new_candidates.push_back(candidates->data[idx]);
}
// Replace the data in candidates with the new_candidates data
std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
candidates->size = new_candidates.size();
candidates->sorted = false;
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_entropy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
const int64_t t_start_sample_us = ggml_time_us();
// no need to do anything if there is only one (or zero) candidates
if(candidates->size <= 1) {
return;
}
// Calculate maximum possible entropy
float max_entropy = -logf(1.0f / candidates->size);
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
// Calculate entropy of the softmax probabilities
float entropy = 0.0f;
for (size_t i = 0; i < candidates->size; ++i) {
float prob = candidates->data[i].p;
if (prob > 0.0f) { // Ensure no log(0)
entropy -= prob * logf(prob);
}
}
// Normalize the entropy (max_entropy cannot be 0 here because we checked candidates->size != 1 above)
float normalized_entropy = entropy / max_entropy;
// Map the normalized entropy to the desired temperature range using the power function
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
#ifdef DEBUG
LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
LLAMA_LOG_INFO("Entropy: %f\n", entropy);
LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
#endif
// Apply the dynamically calculated temperature scaling
for (size_t i = 0; i < candidates->size; ++i) {
candidates->data[i].logit /= dyn_temp;
}
// Re-compute softmax probabilities after scaling logits with dynamic temperature
double max_l_double = candidates->data[0].logit;
double cum_sum_double = 0.0;
for (size_t i = 0; i < candidates->size; ++i) {
double p = exp(candidates->data[i].logit - max_l_double);
candidates->data[i].p = p; // Store the scaled probability
cum_sum_double += p;
}
for (size_t i = 0; i < candidates->size; ++i) {
candidates->data[i].p /= cum_sum_double; // Re-normalize the probabilities
}
#ifdef DEBUG
// Print the updated top 25 probabilities after temperature scaling
LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
for (size_t i = 0; i < 25 && i < candidates->size; ++i) {
LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates->data[i].p * 100.0f);
}
#endif
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float temp) {
const int64_t t_start_sample_us = ggml_time_us();
for (size_t i = 0; i < candidates->size; ++i) {
candidates->data[i].logit /= temp;
}
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_repetition_penalties_impl(
struct llama_sampling * smpl,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present) {
if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
return;
}
const int64_t t_start_sample_us = ggml_time_us();
// Create a frequency map to count occurrences of each token in last_tokens
std::unordered_map<llama_token, int> token_count;
for (size_t i = 0; i < penalty_last_n; ++i) {
token_count[last_tokens[i]]++;
}
// Apply frequency and presence penalties to the candidates
for (size_t i = 0; i < candidates->size; ++i) {
const auto token_iter = token_count.find(candidates->data[i].id);
if (token_iter == token_count.end()) {
continue;
}
const int count = token_iter->second;
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
if (candidates->data[i].logit <= 0) {
candidates->data[i].logit *= penalty_repeat;
} else {
candidates->data[i].logit /= penalty_repeat;
}
candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
}
candidates->sorted = false;
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_apply_guidance_impl(
struct llama_sampling * smpl,
float * logits,
float * logits_guidance,
float scale) {
GGML_ASSERT(smpl);
const auto t_start_sample_us = ggml_time_us();
const auto n_vocab = smpl->n_vocab;
llama_log_softmax(logits, n_vocab);
llama_log_softmax(logits_guidance, n_vocab);
for (int i = 0; i < n_vocab; ++i) {
auto & l = logits[i];
const auto & g = logits_guidance[i];
l = scale * (l - g) + g;
}
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
llama_token llama_sample_token_mirostat_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
GGML_ASSERT(smpl);
const int32_t n_vocab = float(smpl->n_vocab);
int64_t t_start_sample_us = ggml_time_us();
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
// Estimate s_hat using the most probable m tokens
float s_hat = 0.0;
float sum_ti_bi = 0.0;
float sum_ti_sq = 0.0;
for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
float t_i = logf(float(i + 2) / float(i + 1));
float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
sum_ti_bi += t_i * b_i;
sum_ti_sq += t_i * t_i;
}
s_hat = sum_ti_bi / sum_ti_sq;
// Compute k from the estimated s_hat and target surprise value
float epsilon_hat = s_hat - 1;
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat);
// Sample the next word X using top-k sampling
llama_sample_top_k_impl((struct llama_sampling *) nullptr, candidates, int(k), 1);
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
llama_token X = llama_sample_token_impl(smpl, candidates);
t_start_sample_us = ggml_time_us();
// Compute error as the difference between observed surprise and target surprise value
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
return candidate.id == X;
}));
float observed_surprise = -log2f(candidates->data[X_idx].p);
float e = observed_surprise - tau;
// Update mu using the learning rate and error
*mu = *mu - eta * e;
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
return X;
}
llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {
int64_t t_start_sample_us;
t_start_sample_us = ggml_time_us();
llama_sample_softmax_impl(smpl, candidates);
// Truncate the words with surprise values greater than mu
candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
return -log2f(candidate.p) > *mu;
}));
if (candidates->size == 0) {
candidates->size = 1;
}
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
// Normalize the probabilities of the remaining words
llama_sample_softmax_impl(smpl, candidates);
// Sample the next word X from the remaining words
llama_token X = llama_sample_token_impl(smpl, candidates);
t_start_sample_us = ggml_time_us();
// Compute error as the difference between observed surprise and target surprise value
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
return candidate.id == X;
}));
float observed_surprise = -log2f(candidates->data[X_idx].p);
float e = observed_surprise - tau;
// Update mu using the learning rate and error
*mu = *mu - eta * e;
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
}
return X;
}
llama_token llama_sample_token_greedy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
const int64_t t_start_sample_us = ggml_time_us();
// Find max element
auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.logit < b.logit;
});
llama_token result = max_iter->id;
if (smpl) {
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
smpl->n_sample++;
}
return result;
}
llama_token llama_sample_token_with_rng_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng) {
GGML_ASSERT(smpl);
const int64_t t_start_sample_us = ggml_time_us();
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
std::vector<float> probs;
probs.reserve(candidates->size);
for (size_t i = 0; i < candidates->size; ++i) {
probs.push_back(candidates->data[i].p);
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
llama_token result = candidates->data[idx].id;
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
smpl->n_sample++;
return result;
}
llama_token llama_sample_token_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
return llama_sample_token_with_rng_impl(smpl, candidates, smpl->rng);
}

56
src/llama-sampling.h Normal file
View file

@ -0,0 +1,56 @@
#pragma once
#include "llama-impl.h"
struct llama_sampling {
llama_sampling(int32_t n_vocab) : n_vocab(n_vocab) {}
std::mt19937 rng;
int32_t n_vocab = 0;
mutable int64_t t_sample_us = 0;
mutable int32_t n_sample = 0;
void reset_timings() const {
t_sample_us = 0;
n_sample = 0;
}
};
//
// internal API
//
void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed);
void llama_sample_softmax_impl (struct llama_sampling * smpl, llama_token_data_array * candidates);
void llama_sample_top_k_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep);
void llama_sample_top_p_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
void llama_sample_min_p_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep);
void llama_sample_typical_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
void llama_sample_entropy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
void llama_sample_temp_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
void llama_sample_repetition_penalties_impl(
struct llama_sampling * smpl,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present);
void llama_sample_apply_guidance_impl(
struct llama_sampling * smpl,
float * logits,
float * logits_guidance,
float scale);
llama_token llama_sample_token_mirostat_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu);
llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu);
llama_token llama_sample_token_greedy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates);
llama_token llama_sample_token_with_rng_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng);
llama_token llama_sample_token_impl (struct llama_sampling * smpl, llama_token_data_array * candidates);

1721
src/llama-vocab.cpp Normal file

File diff suppressed because it is too large Load diff

130
src/llama-vocab.h Normal file
View file

@ -0,0 +1,130 @@
#pragma once
#include "llama-impl.h"
#include <string>
#include <vector>
#include <unordered_map>
#include <map>
struct llama_vocab {
using id = llama_token;
using token = std::string;
using tattr = llama_token_attr;
struct token_data {
token text;
float score;
tattr attr;
};
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
id special_bos_id = 1;
id special_eos_id = 2;
id special_unk_id = 0;
id special_sep_id = -1;
id special_pad_id = -1;
id special_cls_id = -1;
id special_mask_id = -1;
id linefeed_id = 13;
id special_prefix_id = -1;
id special_suffix_id = -1;
id special_middle_id = -1;
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
// tokenizer flags
bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
bool tokenizer_remove_extra_whitespaces = false;
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;
std::vector<char> precompiled_charsmap;
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
};
const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);
//
// internal API
//
// TODO: rename to llama_tokenize_impl
// TODO: This should probably be in llama.h
std::vector<llama_vocab::id> llama_tokenize_internal(
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special = false);
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special);
// does not write null-terminator to buf
int32_t llama_token_to_piece_impl(
const struct llama_vocab & vocab,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);

File diff suppressed because it is too large Load diff

View file

@ -19,6 +19,12 @@
#include <locale> #include <locale>
#include <codecvt> #include <codecvt>
size_t unicode_len_utf8(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
return lookup[highbits];
}
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) { static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result; std::string result;
for (size_t i = 0; i < cps.size(); ++i) { for (size_t i = 0; i < cps.size(); ++i) {

View file

@ -4,6 +4,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
// TODO: prefix all symbols with "llama_"
struct codepoint_flags { struct codepoint_flags {
enum { enum {
UNDEFINED = 0x0001, UNDEFINED = 0x0001,
@ -46,6 +48,7 @@ struct codepoint_flags {
} }
}; };
size_t unicode_len_utf8(char src);
std::string unicode_cpt_to_utf8(uint32_t cp); std::string unicode_cpt_to_utf8(uint32_t cp);
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset); uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);

View file

@ -70,21 +70,19 @@ add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
target_link_libraries(test-tokenizer-0 PRIVATE common) target_link_libraries(test-tokenizer-0 PRIVATE common)
install(TARGETS test-tokenizer-0 RUNTIME) install(TARGETS test-tokenizer-0 RUNTIME)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
# TODO: enable when fixed
# https://github.com/ggerganov/llama.cpp/pull/7036
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
# build test-tokenizer-1-bpe target once and add many tests # build test-tokenizer-1-bpe target once and add many tests
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp) add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
@ -92,16 +90,14 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
install(TARGETS test-tokenizer-1-bpe RUNTIME) install(TARGETS test-tokenizer-1-bpe RUNTIME)
# TODO: disabled due to slowness # TODO: disabled due to slowness
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf) #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf) #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)
# build test-tokenizer-1-spm target once and add many tests # build test-tokenizer-1-spm target once and add many tests
add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp) add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)

View file

@ -1,4 +1,3 @@
#include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <sstream> #include <sstream>
@ -133,13 +132,31 @@ int main(void) {
); );
formatted_chat.resize(res); formatted_chat.resize(res);
std::string output(formatted_chat.data(), formatted_chat.size()); std::string output(formatted_chat.data(), formatted_chat.size());
std::cout << output << "\n-------------------------\n"; printf("%s\n", output.c_str());
printf("-------------------------\n");
assert(output == expected); assert(output == expected);
} }
// test llama_chat_format_single
std::cout << "\n\n=== llama_chat_format_single ===\n\n"; // test llama_chat_format_single for system message
printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
std::vector<llama_chat_msg> chat2; std::vector<llama_chat_msg> chat2;
llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
auto fmt_sys = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n");
return output;
};
assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message
assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
// test llama_chat_format_single for user message
printf("\n\n=== llama_chat_format_single (user message) ===\n\n");
chat2.push_back({"system", "You are a helpful assistant"}); chat2.push_back({"system", "You are a helpful assistant"});
chat2.push_back({"user", "Hello"}); chat2.push_back({"user", "Hello"});
chat2.push_back({"assistant", "I am assistant"}); chat2.push_back({"assistant", "I am assistant"});
@ -147,7 +164,8 @@ int main(void) {
auto fmt_single = [&](std::string tmpl) { auto fmt_single = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true); auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n"; printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n");
return output; return output;
}; };
assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");

View file

@ -49,16 +49,21 @@ static bool match_string(const std::string & input, llama_grammar* grammar) {
const auto & code_points = decoded.first; const auto & code_points = decoded.first;
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks; const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) { llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
if (cur_stacks.empty()) {
// no stacks means that the grammar failed to match at this point // no stacks means that the grammar failed to match at this point
return false; return false;
} }
} }
for (const auto & stack : grammar->stacks) { for (const auto & stack : cur_stacks) {
if (stack.empty()) { if (stack.empty()) {
// An empty stack means that the grammar has been completed // An empty stack means that the grammar has been completed
return true; return true;
@ -75,7 +80,9 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
auto grammar = build_grammar(grammar_str); auto grammar = build_grammar(grammar_str);
// Save the original grammar stacks so that we can reset after every new string we want to test // Save the original grammar stacks so that we can reset after every new string we want to test
auto original_stacks = grammar->stacks; const llama_grammar_stacks original_stacks = llama_grammar_get_stacks(grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
fprintf(stderr, " 🔵 Valid strings:\n"); fprintf(stderr, " 🔵 Valid strings:\n");
@ -112,7 +119,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
assert(matched); assert(matched);
// Reset the grammar stacks // Reset the grammar stacks
grammar->stacks = original_stacks; cur_stacks = original_stacks;
} }
fprintf(stderr, " 🟠 Invalid strings:\n"); fprintf(stderr, " 🟠 Invalid strings:\n");
@ -132,7 +139,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
assert(!matched); assert(!matched);
// Reset the grammar stacks // Reset the grammar stacks
grammar->stacks = original_stacks; cur_stacks = original_stacks;
} }
// Clean up allocated memory // Clean up allocated memory

View file

@ -2,10 +2,12 @@
#undef NDEBUG #undef NDEBUG
#endif #endif
#include "llama.cpp" // TODO: not great #define LLAMA_API_INTERNAL
#include "llama.h"
#include "grammar-parser.h" #include "grammar-parser.h"
#include <cassert> #include <cassert>
#include <stdexcept>
int main() int main()
{ {
@ -114,8 +116,8 @@ int main()
llama_grammar * grammar = NULL; llama_grammar * grammar = NULL;
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules()); std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) if (grammar == nullptr)
{ {
throw std::runtime_error("Failed to initialize llama_grammar"); throw std::runtime_error("Failed to initialize llama_grammar");
@ -172,7 +174,7 @@ int main()
}}; }};
auto index = 0; auto index = 0;
for (auto stack : grammar->stacks) for (auto stack : llama_grammar_get_stacks(grammar))
{ {
// compare stack to expected_stack // compare stack to expected_stack
for (uint32_t i = 0; i < stack.size(); i++) for (uint32_t i = 0; i < stack.size(); i++)
@ -374,13 +376,13 @@ int main()
}, },
}; };
std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates); std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
std::vector<std::vector<llama_grammar_candidate>> all_rejects; std::vector<std::vector<llama_grammar_candidate>> all_rejects;
for (std::size_t count = 0; count < grammar->stacks.size(); ++count) for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
{ {
rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates); rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
all_rejects.push_back(rejects); all_rejects.push_back(rejects);
} }
@ -401,6 +403,6 @@ int main()
delete[] candidate.code_points; delete[] candidate.code_points;
candidate.code_points = nullptr; candidate.code_points = nullptr;
} }
delete grammar; llama_grammar_free(grammar);
return 0; return 0;
} }