Merge branch 'master' into master
This commit is contained in:
commit
3cd3964587
55 changed files with 3636 additions and 2119 deletions
1
.flake8
1
.flake8
|
@ -1,2 +1,3 @@
|
||||||
[flake8]
|
[flake8]
|
||||||
max-line-length = 125
|
max-line-length = 125
|
||||||
|
ignore = W503
|
||||||
|
|
2
.github/workflows/python-lint.yml
vendored
2
.github/workflows/python-lint.yml
vendored
|
@ -16,5 +16,5 @@ jobs:
|
||||||
- name: flake8 Lint
|
- name: flake8 Lint
|
||||||
uses: py-actions/flake8@v2
|
uses: py-actions/flake8@v2
|
||||||
with:
|
with:
|
||||||
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
|
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
|
||||||
exclude: "examples/*,examples/*/**,*/**/__init__.py"
|
exclude: "examples/*,examples/*/**,*/**/__init__.py"
|
||||||
|
|
8
Makefile
8
Makefile
|
@ -569,6 +569,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
|
||||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||||
ifdef LLAMA_CUBLAS
|
ifdef LLAMA_CUBLAS
|
||||||
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||||
|
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||||
|
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||||
|
ifndef CUDA_DOCKER_ARCH
|
||||||
|
ifndef CUDA_POWER_ARCH
|
||||||
|
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
|
||||||
|
endif # CUDA_POWER_ARCH
|
||||||
|
endif # CUDA_DOCKER_ARCH
|
||||||
|
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
||||||
endif # LLAMA_CUBLAS
|
endif # LLAMA_CUBLAS
|
||||||
$(info )
|
$(info )
|
||||||
|
|
||||||
|
|
|
@ -13,17 +13,31 @@ let package = Package(
|
||||||
products: [
|
products: [
|
||||||
.library(name: "llama", targets: ["llama"]),
|
.library(name: "llama", targets: ["llama"]),
|
||||||
],
|
],
|
||||||
dependencies: [
|
|
||||||
.package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
|
|
||||||
],
|
|
||||||
targets: [
|
targets: [
|
||||||
.target(
|
.target(
|
||||||
name: "llama",
|
name: "llama",
|
||||||
dependencies: ["ggml"],
|
|
||||||
path: ".",
|
path: ".",
|
||||||
exclude: ["ggml-metal.metal"],
|
exclude: [
|
||||||
|
"cmake",
|
||||||
|
"examples",
|
||||||
|
"scripts",
|
||||||
|
"models",
|
||||||
|
"tests",
|
||||||
|
"CMakeLists.txt",
|
||||||
|
"ggml-cuda.cu",
|
||||||
|
"ggml-cuda.h",
|
||||||
|
"Makefile"
|
||||||
|
],
|
||||||
sources: [
|
sources: [
|
||||||
|
"ggml.c",
|
||||||
"llama.cpp",
|
"llama.cpp",
|
||||||
|
"ggml-alloc.c",
|
||||||
|
"ggml-backend.c",
|
||||||
|
"ggml-quants.c",
|
||||||
|
"ggml-metal.m",
|
||||||
|
],
|
||||||
|
resources: [
|
||||||
|
.process("ggml-metal.metal")
|
||||||
],
|
],
|
||||||
publicHeadersPath: "spm-headers",
|
publicHeadersPath: "spm-headers",
|
||||||
cSettings: [
|
cSettings: [
|
||||||
|
|
46
ci/run.sh
46
ci/run.sh
|
@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
|
||||||
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# bge-small
|
||||||
|
|
||||||
|
function gg_run_embd_bge_small {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
|
||||||
|
|
||||||
|
path_models="../models-mnt/bge-small"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert-hf-to-gguf.py ${path_models}
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
|
||||||
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
|
||||||
|
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_embd_bge_small {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'BGE Small (BERT):\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
}
|
||||||
|
|
||||||
## main
|
## main
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
|
@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
|
||||||
test $ret -eq 0 && gg_run ctest_release
|
test $ret -eq 0 && gg_run ctest_release
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
|
test $ret -eq 0 && gg_run embd_bge_small
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||||
test $ret -eq 0 && gg_run open_llama_3b_v2
|
test $ret -eq 0 && gg_run open_llama_3b_v2
|
||||||
|
|
|
@ -340,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
sparams.samplers_sequence = parse_samplers_input(argv[i]);
|
const auto sampler_names = string_split(argv[i], ';');
|
||||||
|
sparams.samplers_sequence = sampler_types_from_names(sampler_names);
|
||||||
} else if (arg == "--sampling-seq") {
|
} else if (arg == "--sampling-seq") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
sparams.samplers_sequence = argv[i];
|
sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
|
||||||
} else if (arg == "--top-p") {
|
} else if (arg == "--top-p") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -906,6 +907,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
const llama_sampling_params & sparams = params.sparams;
|
const llama_sampling_params & sparams = params.sparams;
|
||||||
|
|
||||||
|
std::string sampler_type_chars;
|
||||||
|
std::string sampler_type_names;
|
||||||
|
for (const auto sampler_type : sparams.samplers_sequence) {
|
||||||
|
sampler_type_chars += static_cast<char>(sampler_type);
|
||||||
|
sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
|
||||||
|
}
|
||||||
|
sampler_type_names.pop_back();
|
||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("usage: %s [options]\n", argv[0]);
|
printf("usage: %s [options]\n", argv[0]);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
@ -947,8 +956,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
||||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
|
printf(" --samplers samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str());
|
||||||
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
|
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
|
||||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
||||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||||
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
||||||
|
@ -1097,45 +1106,85 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// String parsing
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
std::string parse_samplers_input(std::string input) {
|
std::vector<std::string> string_split(std::string input, char separator) {
|
||||||
std::string output = "";
|
std::vector<std::string> parts;
|
||||||
|
size_t separator_pos = input.find(separator);
|
||||||
|
while (separator_pos != std::string::npos) {
|
||||||
|
std::string part = input.substr(0, separator_pos);
|
||||||
|
parts.emplace_back(part);
|
||||||
|
input = input.substr(separator_pos + 1);
|
||||||
|
separator_pos = input.find(separator);
|
||||||
|
}
|
||||||
|
parts.emplace_back(input);
|
||||||
|
return parts;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names) {
|
||||||
// since samplers names are written multiple ways
|
// since samplers names are written multiple ways
|
||||||
// make it ready for both system names and input names
|
// make it ready for both system names and input names
|
||||||
std::unordered_map<std::string, char> samplers_symbols {
|
std::unordered_map<std::string, llama_sampler_type> sampler_name_map {
|
||||||
{"top_k", 'k'},
|
{"top_k", llama_sampler_type::TOP_K},
|
||||||
{"top-k", 'k'},
|
{"top-k", llama_sampler_type::TOP_K},
|
||||||
{"top_p", 'p'},
|
{"top_p", llama_sampler_type::TOP_P},
|
||||||
{"top-p", 'p'},
|
{"top-p", llama_sampler_type::TOP_P},
|
||||||
{"nucleus", 'p'},
|
{"nucleus", llama_sampler_type::TOP_P},
|
||||||
{"typical_p", 'y'},
|
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||||
{"typical-p", 'y'},
|
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||||
{"typical", 'y'},
|
{"typical", llama_sampler_type::TYPICAL_P},
|
||||||
{"min_p", 'm'},
|
{"min_p", llama_sampler_type::MIN_P},
|
||||||
{"min-p", 'm'},
|
{"min-p", llama_sampler_type::MIN_P},
|
||||||
{"tfs_z", 'f'},
|
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||||
{"tfs-z", 'f'},
|
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||||
{"tfs", 'f'},
|
{"tfs", llama_sampler_type::TFS_Z},
|
||||||
{"temp", 't'},
|
{"temp", llama_sampler_type::TEMP},
|
||||||
{"temperature",'t'}
|
{"temperature", llama_sampler_type::TEMP}
|
||||||
};
|
};
|
||||||
// expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
|
|
||||||
size_t separator = input.find(';');
|
|
||||||
while (separator != input.npos) {
|
|
||||||
std::string name = input.substr(0,separator);
|
|
||||||
input = input.substr(separator+1);
|
|
||||||
separator = input.find(';');
|
|
||||||
|
|
||||||
if (samplers_symbols.find(name) != samplers_symbols.end()) {
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
output += samplers_symbols[name];
|
sampler_types.reserve(names.size());
|
||||||
|
for (const auto& name : names) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (samplers_symbols.find(input) != samplers_symbols.end()) {
|
return sampler_types;
|
||||||
output += samplers_symbols[input];
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
|
||||||
|
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
||||||
|
{'k', llama_sampler_type::TOP_K},
|
||||||
|
{'p', llama_sampler_type::TOP_P},
|
||||||
|
{'y', llama_sampler_type::TYPICAL_P},
|
||||||
|
{'m', llama_sampler_type::MIN_P},
|
||||||
|
{'f', llama_sampler_type::TFS_Z},
|
||||||
|
{'t', llama_sampler_type::TEMP}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
sampler_types.reserve(names_string.size());
|
||||||
|
for (const auto & c : names_string) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(c);
|
||||||
|
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
|
||||||
|
switch (sampler_type) {
|
||||||
|
case llama_sampler_type::TOP_K: return "top_k";
|
||||||
|
case llama_sampler_type::TFS_Z: return "tfs_z";
|
||||||
|
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||||
|
case llama_sampler_type::TOP_P: return "top_p";
|
||||||
|
case llama_sampler_type::MIN_P: return "min_p";
|
||||||
|
case llama_sampler_type::TEMP: return "temp";
|
||||||
|
default : return "";
|
||||||
}
|
}
|
||||||
return output;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -1550,6 +1599,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
||||||
|
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
|
||||||
|
|
||||||
#ifdef NDEBUG
|
#ifdef NDEBUG
|
||||||
fprintf(stream, "debug: false\n");
|
fprintf(stream, "debug: false\n");
|
||||||
|
|
|
@ -162,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng);
|
||||||
void process_escapes(std::string& input);
|
void process_escapes(std::string& input);
|
||||||
|
|
||||||
//
|
//
|
||||||
// String parsing
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
std::string parse_samplers_input(std::string input);
|
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names);
|
||||||
|
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
|
||||||
|
std::vector<std::string> string_split(std::string input, char separator);
|
||||||
|
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Model utils
|
// Model utils
|
||||||
|
|
|
@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
|
||||||
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||||
std::string result = "CFG -> Penalties ";
|
std::string result = "CFG -> Penalties ";
|
||||||
if (params.mirostat == 0) {
|
if (params.mirostat == 0) {
|
||||||
for (auto s : params.samplers_sequence) {
|
for (auto sampler_type : params.samplers_sequence) {
|
||||||
switch (s) {
|
const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
|
||||||
case 'k': result += "-> top_k "; break;
|
if (!sampler_type_name.empty()) {
|
||||||
case 'f': result += "-> tfs_z "; break;
|
result += "-> " + sampler_type_name + " ";
|
||||||
case 'y': result += "-> typical_p "; break;
|
|
||||||
case 'p': result += "-> top_p "; break;
|
|
||||||
case 'm': result += "-> min_p "; break;
|
|
||||||
case 't': result += "-> temp "; break;
|
|
||||||
default : break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -127,8 +122,6 @@ static void sampler_queue(
|
||||||
const llama_sampling_params & params,
|
const llama_sampling_params & params,
|
||||||
llama_token_data_array & cur_p,
|
llama_token_data_array & cur_p,
|
||||||
size_t & min_keep) {
|
size_t & min_keep) {
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
|
||||||
|
|
||||||
const float temp = params.temp;
|
const float temp = params.temp;
|
||||||
const float dynatemp_range = params.dynatemp_range;
|
const float dynatemp_range = params.dynatemp_range;
|
||||||
const float dynatemp_exponent = params.dynatemp_exponent;
|
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||||
|
@ -137,16 +130,16 @@ static void sampler_queue(
|
||||||
const float min_p = params.min_p;
|
const float min_p = params.min_p;
|
||||||
const float tfs_z = params.tfs_z;
|
const float tfs_z = params.tfs_z;
|
||||||
const float typical_p = params.typical_p;
|
const float typical_p = params.typical_p;
|
||||||
const std::string & samplers_sequence = params.samplers_sequence;
|
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
||||||
|
|
||||||
for (auto s : samplers_sequence) {
|
for (auto sampler_type : samplers_sequence) {
|
||||||
switch (s){
|
switch (sampler_type) {
|
||||||
case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
||||||
case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
||||||
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||||
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||||
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||||
case 't':
|
case llama_sampler_type::TEMP:
|
||||||
if (dynatemp_range > 0) {
|
if (dynatemp_range > 0) {
|
||||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||||
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||||
|
|
|
@ -8,6 +8,16 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
|
// sampler types
|
||||||
|
enum class llama_sampler_type : char {
|
||||||
|
TOP_K = 'k',
|
||||||
|
TOP_P = 'p',
|
||||||
|
MIN_P = 'm',
|
||||||
|
TFS_Z = 'f',
|
||||||
|
TYPICAL_P = 'y',
|
||||||
|
TEMP = 't'
|
||||||
|
};
|
||||||
|
|
||||||
// sampling parameters
|
// sampling parameters
|
||||||
typedef struct llama_sampling_params {
|
typedef struct llama_sampling_params {
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
int32_t n_prev = 64; // number of previous tokens to remember
|
||||||
|
@ -28,7 +38,15 @@ typedef struct llama_sampling_params {
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||||
std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
|
|
||||||
|
std::vector<llama_sampler_type> samplers_sequence = {
|
||||||
|
llama_sampler_type::TOP_K,
|
||||||
|
llama_sampler_type::TFS_Z,
|
||||||
|
llama_sampler_type::TYPICAL_P,
|
||||||
|
llama_sampler_type::TOP_P,
|
||||||
|
llama_sampler_type::MIN_P,
|
||||||
|
llama_sampler_type::TEMP
|
||||||
|
};
|
||||||
|
|
||||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ import re
|
||||||
import sys
|
import sys
|
||||||
from enum import IntEnum
|
from enum import IntEnum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
|
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
@ -25,15 +25,6 @@ import gguf
|
||||||
from convert import HfVocab
|
from convert import HfVocab
|
||||||
|
|
||||||
|
|
||||||
# check for any of the given keys in the dictionary and return the value of the first key found
|
|
||||||
def get_key_opts(d, keys):
|
|
||||||
for k in keys:
|
|
||||||
if k in d:
|
|
||||||
return d[k]
|
|
||||||
print(f"Could not find any of {keys}")
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
|
|
||||||
###### MODEL DEFINITIONS ######
|
###### MODEL DEFINITIONS ######
|
||||||
|
|
||||||
class SentencePieceTokenTypes(IntEnum):
|
class SentencePieceTokenTypes(IntEnum):
|
||||||
|
@ -58,6 +49,15 @@ class Model:
|
||||||
self.hparams = Model.load_hparams(self.dir_model)
|
self.hparams = Model.load_hparams(self.dir_model)
|
||||||
self.model_arch = self._get_model_architecture()
|
self.model_arch = self._get_model_architecture()
|
||||||
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
|
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
|
||||||
|
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
|
||||||
|
|
||||||
|
def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
|
||||||
|
key = next((k for k in keys if k in self.hparams), None)
|
||||||
|
if key is not None:
|
||||||
|
return self.hparams[key]
|
||||||
|
if optional:
|
||||||
|
return None
|
||||||
|
raise KeyError(f"could not find any of: {keys}")
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_gpt2()
|
self._set_vocab_gpt2()
|
||||||
|
@ -79,28 +79,33 @@ class Model:
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
self.gguf_writer.add_name(self.dir_model.name)
|
self.gguf_writer.add_name(self.dir_model.name)
|
||||||
self.gguf_writer.add_block_count(self.hparams.get(
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
"n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
|
|
||||||
))
|
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
|
||||||
if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
|
|
||||||
self.gguf_writer.add_context_length(n_ctx)
|
self.gguf_writer.add_context_length(n_ctx)
|
||||||
if (n_embd := self.hparams.get("hidden_size")) is not None:
|
|
||||||
|
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||||
self.gguf_writer.add_embedding_length(n_embd)
|
self.gguf_writer.add_embedding_length(n_embd)
|
||||||
if (n_ff := self.hparams.get("intermediate_size")) is not None:
|
|
||||||
|
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
|
||||||
self.gguf_writer.add_feed_forward_length(n_ff)
|
self.gguf_writer.add_feed_forward_length(n_ff)
|
||||||
if (n_head := self.hparams.get("num_attention_heads")) is not None:
|
|
||||||
|
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||||
self.gguf_writer.add_head_count(n_head)
|
self.gguf_writer.add_head_count(n_head)
|
||||||
|
|
||||||
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
|
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
|
||||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||||
|
|
||||||
if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
|
if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
|
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
||||||
|
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
|
||||||
|
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
|
||||||
if (n_experts := self.hparams.get("num_local_experts")) is not None:
|
if (n_experts := self.hparams.get("num_local_experts")) is not None:
|
||||||
self.gguf_writer.add_expert_count(n_experts)
|
self.gguf_writer.add_expert_count(n_experts)
|
||||||
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
||||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||||
|
|
||||||
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
def write_tensors(self):
|
def write_tensors(self):
|
||||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||||
|
@ -209,6 +214,10 @@ class Model:
|
||||||
return InternLM2Model
|
return InternLM2Model
|
||||||
if model_architecture == "MiniCPMForCausalLM":
|
if model_architecture == "MiniCPMForCausalLM":
|
||||||
return MiniCPMModel
|
return MiniCPMModel
|
||||||
|
if model_architecture == "BertModel":
|
||||||
|
return BertModel
|
||||||
|
if model_architecture == "NomicBertModel":
|
||||||
|
return NomicBertModel
|
||||||
return Model
|
return Model
|
||||||
|
|
||||||
def _is_model_safetensors(self) -> bool:
|
def _is_model_safetensors(self) -> bool:
|
||||||
|
@ -264,6 +273,10 @@ class Model:
|
||||||
return gguf.MODEL_ARCH.INTERNLM2
|
return gguf.MODEL_ARCH.INTERNLM2
|
||||||
if arch == "MiniCPMForCausalLM":
|
if arch == "MiniCPMForCausalLM":
|
||||||
return gguf.MODEL_ARCH.MINICPM
|
return gguf.MODEL_ARCH.MINICPM
|
||||||
|
if arch == "BertModel":
|
||||||
|
return gguf.MODEL_ARCH.BERT
|
||||||
|
if arch == "NomicBertModel":
|
||||||
|
return gguf.MODEL_ARCH.NOMIC_BERT
|
||||||
|
|
||||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||||
|
|
||||||
|
@ -1293,21 +1306,21 @@ class GPT2Model(Model):
|
||||||
|
|
||||||
class Phi2Model(Model):
|
class Phi2Model(Model):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
|
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
|
||||||
|
|
||||||
rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
|
rot_pct = self.find_hparam(["partial_rotary_factor"])
|
||||||
n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
|
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||||
n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
|
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||||
|
|
||||||
self.gguf_writer.add_name("Phi2")
|
self.gguf_writer.add_name("Phi2")
|
||||||
self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
|
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
|
||||||
|
|
||||||
self.gguf_writer.add_embedding_length(n_embd)
|
self.gguf_writer.add_embedding_length(n_embd)
|
||||||
self.gguf_writer.add_feed_forward_length(4 * n_embd)
|
self.gguf_writer.add_feed_forward_length(4 * n_embd)
|
||||||
self.gguf_writer.add_block_count(block_count)
|
self.gguf_writer.add_block_count(block_count)
|
||||||
self.gguf_writer.add_head_count(n_head)
|
self.gguf_writer.add_head_count(n_head)
|
||||||
self.gguf_writer.add_head_count_kv(n_head)
|
self.gguf_writer.add_head_count_kv(n_head)
|
||||||
self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
|
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
|
||||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
self.gguf_writer.add_add_bos_token(False)
|
self.gguf_writer.add_add_bos_token(False)
|
||||||
|
@ -1629,6 +1642,127 @@ in chat mode so that the conversation can end normally.")
|
||||||
self.post_write_tensors(tensor_map, name, data_torch)
|
self.post_write_tensors(tensor_map, name, data_torch)
|
||||||
|
|
||||||
|
|
||||||
|
class BertModel(Model):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.vocab_size = None
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
self.gguf_writer.add_causal_attention(False)
|
||||||
|
self.gguf_writer.add_pooling_layer(True)
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
path = self.dir_model
|
||||||
|
added_tokens_path = self.dir_model if self.dir_model.exists() else None
|
||||||
|
|
||||||
|
# use huggingface vocab to get all tokens
|
||||||
|
vocab = HfVocab(path, added_tokens_path)
|
||||||
|
tokens, scores, toktypes = zip(*vocab.all_tokens())
|
||||||
|
assert len(tokens) == vocab.vocab_size
|
||||||
|
self.vocab_size = vocab.vocab_size
|
||||||
|
|
||||||
|
# we need this to validate the size of the token_type embeddings
|
||||||
|
# though currently we are passing all zeros to the token_type embeddings
|
||||||
|
n_token_types = len(set(toktypes))
|
||||||
|
self.gguf_writer.add_token_type_count(n_token_types)
|
||||||
|
|
||||||
|
# convert to phantom space vocab
|
||||||
|
def phantom(tok, typ):
|
||||||
|
if tok.startswith(b"[") and tok.endswith(b"]"):
|
||||||
|
return tok
|
||||||
|
if tok.startswith(b"##"):
|
||||||
|
return tok[2:]
|
||||||
|
return b"\xe2\x96\x81" + tok
|
||||||
|
tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
|
||||||
|
|
||||||
|
# set up bos and eos tokens (cls and sep)
|
||||||
|
self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
|
||||||
|
self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
|
||||||
|
|
||||||
|
# add vocab to gguf
|
||||||
|
self.gguf_writer.add_tokenizer_model("bert")
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_scores(scores)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
|
# handle special tokens
|
||||||
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||||
|
tensors = dict(self.get_tensors())
|
||||||
|
for name, data_torch in tensors.items():
|
||||||
|
# we are only using BERT for embeddings so we don't need the pooling layer
|
||||||
|
if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
|
||||||
|
continue # we don't need these
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
new_dtype: type[np.floating[Any]]
|
||||||
|
|
||||||
|
if (
|
||||||
|
self.ftype == 1 and name.endswith(".weight") and n_dims == 2
|
||||||
|
and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32
|
||||||
|
):
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
new_dtype = np.float16
|
||||||
|
else:
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
new_dtype = np.float32
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
|
||||||
|
|
||||||
|
if data.dtype != new_dtype:
|
||||||
|
data = data.astype(new_dtype)
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
|
||||||
|
class NomicBertModel(BertModel):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
# the HF config claims n_ctx=8192, but it uses RoPE scaling
|
||||||
|
self.hparams["n_ctx"] = 2048
|
||||||
|
|
||||||
|
# SwigLU activation
|
||||||
|
assert self.hparams["activation_function"] == "swiglu"
|
||||||
|
# this doesn't do anything in the HF version
|
||||||
|
assert self.hparams["causal"] is False
|
||||||
|
# no bias tensors
|
||||||
|
assert self.hparams["qkv_proj_bias"] is False
|
||||||
|
assert self.hparams["mlp_fc1_bias"] is False
|
||||||
|
assert self.hparams["mlp_fc2_bias"] is False
|
||||||
|
# norm at end of layer
|
||||||
|
assert self.hparams["prenorm"] is False
|
||||||
|
# standard RoPE
|
||||||
|
assert self.hparams["rotary_emb_fraction"] == 1.0
|
||||||
|
assert self.hparams["rotary_emb_interleaved"] is False
|
||||||
|
assert self.hparams["rotary_emb_scale_base"] is None
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
||||||
|
|
||||||
|
def get_tensors(self):
|
||||||
|
assert self.vocab_size is not None
|
||||||
|
for name, data in super().get_tensors():
|
||||||
|
# Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
|
||||||
|
if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
|
||||||
|
rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
|
||||||
|
assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
|
||||||
|
data = data[:self.vocab_size, :]
|
||||||
|
yield name, data
|
||||||
|
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -88,7 +88,8 @@ def main():
|
||||||
gguf_writer.add_embedding_length(hidden_size)
|
gguf_writer.add_embedding_length(hidden_size)
|
||||||
gguf_writer.add_block_count(block_count)
|
gguf_writer.add_block_count(block_count)
|
||||||
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
|
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
|
||||||
gguf_writer.add_rope_dimension_count(hidden_size // head_count)
|
# ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
|
||||||
|
gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
|
||||||
gguf_writer.add_head_count(head_count)
|
gguf_writer.add_head_count(head_count)
|
||||||
gguf_writer.add_head_count_kv(head_count_kv)
|
gguf_writer.add_head_count_kv(head_count_kv)
|
||||||
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
|
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
|
||||||
|
|
|
@ -38,6 +38,7 @@ else()
|
||||||
add_subdirectory(speculative)
|
add_subdirectory(speculative)
|
||||||
add_subdirectory(lookahead)
|
add_subdirectory(lookahead)
|
||||||
add_subdirectory(lookup)
|
add_subdirectory(lookup)
|
||||||
|
add_subdirectory(gguf)
|
||||||
add_subdirectory(train-text-from-scratch)
|
add_subdirectory(train-text-from-scratch)
|
||||||
add_subdirectory(imatrix)
|
add_subdirectory(imatrix)
|
||||||
if (LLAMA_BUILD_SERVER)
|
if (LLAMA_BUILD_SERVER)
|
||||||
|
|
|
@ -7,6 +7,51 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static std::vector<std::string> split_lines(const std::string & s) {
|
||||||
|
std::string line;
|
||||||
|
std::vector<std::string> lines;
|
||||||
|
std::stringstream ss(s);
|
||||||
|
while (std::getline(ss, line)) {
|
||||||
|
lines.push_back(line);
|
||||||
|
}
|
||||||
|
return lines;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
|
||||||
|
for (size_t i = 0; i < tokens.size(); i++) {
|
||||||
|
llama_batch_add(batch, tokens[i], i, { seq_id }, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void normalize(float * vec, float * out, int n) {
|
||||||
|
float norm = 0;
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
norm += vec[i] * vec[i];
|
||||||
|
}
|
||||||
|
norm = sqrt(norm);
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
out[i] = vec[i] / norm;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
|
||||||
|
// clear previous kv_cache values (irrelevant for embeddings)
|
||||||
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
|
// run model
|
||||||
|
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||||
|
if (llama_decode(ctx, batch) < 0) {
|
||||||
|
fprintf(stderr, "%s : failed to decode\n", __func__);
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalize on copy
|
||||||
|
for (int k = 0; k < n_seq; k++) {
|
||||||
|
float * emb = llama_get_embeddings_ith(ctx, k);
|
||||||
|
float * out = output + k * n_embd;
|
||||||
|
normalize(emb, out, n_embd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
|
@ -55,49 +100,84 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
int n_past = 0;
|
// split the prompt into lines
|
||||||
|
std::vector<std::string> prompts = split_lines(params.prompt);
|
||||||
|
|
||||||
// tokenize the prompt
|
// max batch size
|
||||||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
const uint64_t n_batch = params.n_batch;
|
||||||
|
GGML_ASSERT(params.n_batch == params.n_ctx);
|
||||||
|
|
||||||
|
// tokenize the prompts and trim
|
||||||
|
std::vector<std::vector<int32_t>> inputs;
|
||||||
|
for (const auto & prompt : prompts) {
|
||||||
|
auto inp = ::llama_tokenize(ctx, prompt, true);
|
||||||
|
if (inp.size() > n_batch) {
|
||||||
|
inp.resize(n_batch);
|
||||||
|
}
|
||||||
|
inputs.push_back(inp);
|
||||||
|
}
|
||||||
|
|
||||||
|
// tokenization stats
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
fprintf(stderr, "\n");
|
for (int i = 0; i < (int) inputs.size(); i++) {
|
||||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
|
||||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
for (int j = 0; j < (int) inputs[i].size(); j++) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n\n");
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (embd_inp.size() > (size_t)n_ctx) {
|
// initialize batch
|
||||||
fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
|
const int n_prompts = prompts.size();
|
||||||
__func__, embd_inp.size(), n_ctx);
|
struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (!embd_inp.empty()) {
|
|
||||||
int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
|
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
n_past += n_tokens;
|
|
||||||
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// allocate output
|
||||||
const int n_embd = llama_n_embd(model);
|
const int n_embd = llama_n_embd(model);
|
||||||
const auto * embeddings = llama_get_embeddings(ctx);
|
std::vector<float> embeddings(n_prompts * n_embd, 0);
|
||||||
|
float * emb = embeddings.data();
|
||||||
|
|
||||||
for (int i = 0; i < n_embd; i++) {
|
// break into batches
|
||||||
printf("%f ", embeddings[i]);
|
int p = 0; // number of prompts processed already
|
||||||
|
int s = 0; // number of prompts in current batch
|
||||||
|
for (int k = 0; k < n_prompts; k++) {
|
||||||
|
// clamp to n_batch tokens
|
||||||
|
auto & inp = inputs[k];
|
||||||
|
const uint64_t n_toks = inp.size();
|
||||||
|
|
||||||
|
// encode if at capacity
|
||||||
|
if (batch.n_tokens + n_toks > n_batch) {
|
||||||
|
float * out = emb + p * n_embd;
|
||||||
|
batch_decode(ctx, batch, out, s, n_embd);
|
||||||
|
llama_batch_clear(batch);
|
||||||
|
p += s;
|
||||||
|
s = 0;
|
||||||
}
|
}
|
||||||
printf("\n");
|
|
||||||
|
|
||||||
|
// add to batch
|
||||||
|
batch_add_seq(batch, inp, s);
|
||||||
|
s += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// final batch
|
||||||
|
float * out = emb + p * n_embd;
|
||||||
|
batch_decode(ctx, batch, out, s, n_embd);
|
||||||
|
|
||||||
|
// print first 3 embeddings
|
||||||
|
for (int j = 0; j < std::min(3, n_prompts); j++) {
|
||||||
|
fprintf(stderr, "embedding %d: ", j);
|
||||||
|
for (int i = 0; i < n_embd; i++) {
|
||||||
|
fprintf(stderr, "%f ", emb[j * n_embd + i]);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n\n");
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
|
// clean up
|
||||||
llama_print_timings(ctx);
|
llama_print_timings(ctx);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -337,24 +337,14 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
|
||||||
params.mem_buffer = NULL;
|
params.mem_buffer = NULL;
|
||||||
params.no_alloc = true;
|
params.no_alloc = true;
|
||||||
struct ggml_context * ctx = NULL;
|
struct ggml_context * ctx = NULL;
|
||||||
struct ggml_allocr * alloc = NULL;
|
struct ggml_gallocr * alloc = NULL;
|
||||||
struct ggml_cgraph * gf = NULL;
|
struct ggml_cgraph * gf = NULL;
|
||||||
|
|
||||||
ctx = ggml_init(params);
|
ctx = ggml_init(params);
|
||||||
alloc = ggml_allocr_new_measure(tensor_alignment);
|
alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
|
||||||
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
|
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
|
||||||
size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
|
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
ggml_free(ctx);
|
|
||||||
|
|
||||||
static std::vector<uint8_t> data_compute;
|
ggml_gallocr_alloc_graph(alloc, gf);
|
||||||
data_compute.resize(alloc_size + tensor_alignment);
|
|
||||||
|
|
||||||
ctx = ggml_init(params);
|
|
||||||
alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
|
|
||||||
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
|
|
||||||
ggml_allocr_alloc_graph(alloc, gf);
|
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
|
|
||||||
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
|
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
|
||||||
static std::vector<uint8_t> data_work;
|
static std::vector<uint8_t> data_work;
|
||||||
|
@ -363,6 +353,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
|
||||||
|
|
||||||
ggml_graph_compute(gf, &cplan);
|
ggml_graph_compute(gf, &cplan);
|
||||||
|
|
||||||
|
ggml_gallocr_free(alloc);
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these
|
||||||
--rank-wk N LORA rank for wk tensor (default 4)
|
--rank-wk N LORA rank for wk tensor (default 4)
|
||||||
--rank-wv N LORA rank for wv tensor (default 4)
|
--rank-wv N LORA rank for wv tensor (default 4)
|
||||||
--rank-wo N LORA rank for wo tensor (default 4)
|
--rank-wo N LORA rank for wo tensor (default 4)
|
||||||
--rank-w1 N LORA rank for w1 tensor (default 4)
|
--rank-ffn_gate N LORA rank for ffn_gate tensor (default 4)
|
||||||
--rank-w2 N LORA rank for w2 tensor (default 4)
|
--rank-ffn_down N LORA rank for ffn_down tensor (default 4)
|
||||||
--rank-w3 N LORA rank for w3 tensor (default 4)
|
--rank-ffn_up N LORA rank for ffn_up tensor (default 4)
|
||||||
```
|
```
|
||||||
|
|
||||||
The LORA rank of 'norm' tensors should always be 1.
|
The LORA rank of 'norm' tensors should always be 1.
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "train.h"
|
#include "train.h"
|
||||||
|
@ -13,8 +14,6 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static const size_t tensor_alignment = 32;
|
|
||||||
|
|
||||||
struct my_llama_hparams {
|
struct my_llama_hparams {
|
||||||
uint32_t n_vocab = 32000;
|
uint32_t n_vocab = 32000;
|
||||||
uint32_t n_ctx = 512;
|
uint32_t n_ctx = 512;
|
||||||
|
@ -61,9 +60,9 @@ struct my_llama_layer {
|
||||||
struct ggml_tensor * ffn_norm;
|
struct ggml_tensor * ffn_norm;
|
||||||
|
|
||||||
// ff
|
// ff
|
||||||
struct ggml_tensor * w1;
|
struct ggml_tensor * ffn_gate; // w1
|
||||||
struct ggml_tensor * w2;
|
struct ggml_tensor * ffn_down; // w2
|
||||||
struct ggml_tensor * w3;
|
struct ggml_tensor * ffn_up; // w3
|
||||||
};
|
};
|
||||||
|
|
||||||
struct my_llama_model {
|
struct my_llama_model {
|
||||||
|
@ -86,9 +85,9 @@ struct my_llama_lora_hparams {
|
||||||
uint32_t n_rank_wv = 4;
|
uint32_t n_rank_wv = 4;
|
||||||
uint32_t n_rank_wo = 4;
|
uint32_t n_rank_wo = 4;
|
||||||
uint32_t n_rank_ffn_norm = 1;
|
uint32_t n_rank_ffn_norm = 1;
|
||||||
uint32_t n_rank_w1 = 4;
|
uint32_t n_rank_ffn_gate = 4;
|
||||||
uint32_t n_rank_w2 = 4;
|
uint32_t n_rank_ffn_down = 4;
|
||||||
uint32_t n_rank_w3 = 4;
|
uint32_t n_rank_ffn_up = 4;
|
||||||
uint32_t n_rank_tok_embeddings = 4;
|
uint32_t n_rank_tok_embeddings = 4;
|
||||||
uint32_t n_rank_norm = 1;
|
uint32_t n_rank_norm = 1;
|
||||||
uint32_t n_rank_output = 4;
|
uint32_t n_rank_output = 4;
|
||||||
|
@ -118,17 +117,17 @@ struct my_llama_lora_layer {
|
||||||
struct ggml_tensor * ffn_norm_b;
|
struct ggml_tensor * ffn_norm_b;
|
||||||
|
|
||||||
// ff
|
// ff
|
||||||
struct ggml_tensor * w1_a;
|
struct ggml_tensor * ffn_gate_a;
|
||||||
struct ggml_tensor * w1_b;
|
struct ggml_tensor * ffn_gate_b;
|
||||||
struct ggml_tensor * w2_a;
|
struct ggml_tensor * ffn_down_a;
|
||||||
struct ggml_tensor * w2_b;
|
struct ggml_tensor * ffn_down_b;
|
||||||
struct ggml_tensor * w3_a;
|
struct ggml_tensor * ffn_up_a;
|
||||||
struct ggml_tensor * w3_b;
|
struct ggml_tensor * ffn_up_b;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct my_llama_lora {
|
struct my_llama_lora {
|
||||||
struct ggml_context * ctx = NULL;
|
struct ggml_context * ctx = NULL;
|
||||||
std::vector<uint8_t> data;
|
ggml_backend_buffer_t data;
|
||||||
|
|
||||||
my_llama_lora_hparams hparams;
|
my_llama_lora_hparams hparams;
|
||||||
|
|
||||||
|
@ -209,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) {
|
||||||
printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv);
|
printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv);
|
||||||
printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo);
|
printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo);
|
||||||
printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm);
|
printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm);
|
||||||
printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1);
|
printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate);
|
||||||
printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2);
|
printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down);
|
||||||
printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3);
|
printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up);
|
||||||
printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
|
printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
|
||||||
printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm);
|
printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm);
|
||||||
printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output);
|
printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output);
|
||||||
|
@ -320,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
|
||||||
layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
|
layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
|
||||||
layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
|
layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
|
||||||
layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
|
layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
|
||||||
layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
|
layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
|
||||||
layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
|
layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||||
layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
|
layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
|
||||||
|
|
||||||
assert_shape_1d(layer.attention_norm, hparams.n_embd);
|
assert_shape_1d(layer.attention_norm, hparams.n_embd);
|
||||||
assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
|
assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
|
||||||
|
@ -330,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
|
||||||
assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
|
assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
|
||||||
assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
|
assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
|
||||||
assert_shape_1d(layer.ffn_norm, hparams.n_embd);
|
assert_shape_1d(layer.ffn_norm, hparams.n_embd);
|
||||||
assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
|
assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff);
|
||||||
assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd);
|
assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd);
|
||||||
assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff);
|
assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -363,69 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) {
|
||||||
ggml_set_param(ctx, layer.wo_b);
|
ggml_set_param(ctx, layer.wo_b);
|
||||||
ggml_set_param(ctx, layer.ffn_norm_a);
|
ggml_set_param(ctx, layer.ffn_norm_a);
|
||||||
ggml_set_param(ctx, layer.ffn_norm_b);
|
ggml_set_param(ctx, layer.ffn_norm_b);
|
||||||
ggml_set_param(ctx, layer.w1_a);
|
ggml_set_param(ctx, layer.ffn_gate_a);
|
||||||
ggml_set_param(ctx, layer.w1_b);
|
ggml_set_param(ctx, layer.ffn_gate_b);
|
||||||
ggml_set_param(ctx, layer.w2_a);
|
ggml_set_param(ctx, layer.ffn_down_a);
|
||||||
ggml_set_param(ctx, layer.w2_b);
|
ggml_set_param(ctx, layer.ffn_down_b);
|
||||||
ggml_set_param(ctx, layer.w3_a);
|
ggml_set_param(ctx, layer.ffn_up_a);
|
||||||
ggml_set_param(ctx, layer.w3_b);
|
ggml_set_param(ctx, layer.ffn_up_b);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) {
|
|
||||||
ggml_allocr_alloc(alloc, lora->tok_embeddings_a);
|
|
||||||
ggml_allocr_alloc(alloc, lora->tok_embeddings_b);
|
|
||||||
ggml_allocr_alloc(alloc, lora->norm_a);
|
|
||||||
ggml_allocr_alloc(alloc, lora->norm_b);
|
|
||||||
ggml_allocr_alloc(alloc, lora->output_a);
|
|
||||||
ggml_allocr_alloc(alloc, lora->output_b);
|
|
||||||
for (uint32_t i = 0; i < lora->layers.size(); ++i) {
|
|
||||||
auto & layer = lora->layers[i];
|
|
||||||
ggml_allocr_alloc(alloc, layer.attention_norm_a);
|
|
||||||
ggml_allocr_alloc(alloc, layer.attention_norm_b);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wq_a);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wq_b);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wk_a);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wk_b);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wv_a);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wv_b);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wo_a);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wo_b);
|
|
||||||
ggml_allocr_alloc(alloc, layer.ffn_norm_a);
|
|
||||||
ggml_allocr_alloc(alloc, layer.ffn_norm_b);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w1_a);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w1_b);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w2_a);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w2_b);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w3_a);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w3_b);
|
|
||||||
}
|
|
||||||
ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, lora->norm_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, lora->norm_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, lora->output_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, lora->output_b->grad);
|
|
||||||
for (uint32_t i = 0; i < lora->layers.size(); ++i) {
|
|
||||||
auto & layer = lora->layers[i];
|
|
||||||
ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wq_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wq_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wk_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wk_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wv_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wv_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wo_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wo_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w1_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w1_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w2_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w2_b->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w3_a->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w3_b->grad);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -493,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
|
||||||
layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
|
layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
|
||||||
layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
|
layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
|
||||||
|
|
||||||
layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd);
|
layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
|
||||||
layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff);
|
layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
|
||||||
layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff);
|
layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
|
||||||
layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd);
|
layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
|
||||||
layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd);
|
layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd);
|
||||||
layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff);
|
layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff);
|
||||||
|
|
||||||
ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
|
ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
|
||||||
ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
|
ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
|
||||||
|
@ -512,28 +454,18 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
|
||||||
ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i));
|
ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i));
|
||||||
ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i));
|
ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i));
|
||||||
ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i));
|
ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i));
|
||||||
ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
|
ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
|
||||||
ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
|
ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
|
||||||
ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
|
ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
|
||||||
ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
|
ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
|
||||||
ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
|
ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
|
||||||
ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
|
ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
|
||||||
}
|
}
|
||||||
|
|
||||||
set_param_lora(lora);
|
set_param_lora(lora);
|
||||||
|
|
||||||
// measure data size
|
// allocate data for lora tensors
|
||||||
size_t size = 0;
|
lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
|
||||||
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
||||||
size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
|
|
||||||
}
|
|
||||||
|
|
||||||
// allocate data
|
|
||||||
struct ggml_allocr * alloc = NULL;
|
|
||||||
lora->data.resize(size + tensor_alignment);
|
|
||||||
alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
|
|
||||||
alloc_lora(alloc, lora);
|
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
|
static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
|
||||||
|
@ -565,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
|
||||||
randomize_tensor_normal(layer.ffn_norm_a, rnd);
|
randomize_tensor_normal(layer.ffn_norm_a, rnd);
|
||||||
ggml_set_zero(layer.ffn_norm_b);
|
ggml_set_zero(layer.ffn_norm_b);
|
||||||
|
|
||||||
randomize_tensor_normal(layer.w1_a, rnd);
|
randomize_tensor_normal(layer.ffn_gate_a, rnd);
|
||||||
ggml_set_zero(layer.w1_b);
|
ggml_set_zero(layer.ffn_gate_b);
|
||||||
randomize_tensor_normal(layer.w2_a, rnd);
|
randomize_tensor_normal(layer.ffn_down_a, rnd);
|
||||||
ggml_set_zero(layer.w2_b);
|
ggml_set_zero(layer.ffn_down_b);
|
||||||
randomize_tensor_normal(layer.w3_a, rnd);
|
randomize_tensor_normal(layer.ffn_up_a, rnd);
|
||||||
ggml_set_zero(layer.w3_b);
|
ggml_set_zero(layer.ffn_up_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
free_random_normal_distribution(rnd);
|
free_random_normal_distribution(rnd);
|
||||||
|
@ -579,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
|
||||||
static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||||
struct my_llama_model * model,
|
struct my_llama_model * model,
|
||||||
struct my_llama_lora * lora,
|
struct my_llama_lora * lora,
|
||||||
struct ggml_allocr * alloc,
|
ggml_gallocr_t alloc,
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_cgraph * gf,
|
struct ggml_cgraph * gf,
|
||||||
struct ggml_cgraph * gb,
|
struct ggml_cgraph * gb,
|
||||||
|
@ -590,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||||
const int n_tokens,
|
const int n_tokens,
|
||||||
const int n_batch,
|
const int n_batch,
|
||||||
const bool enable_flash_attn,
|
const bool enable_flash_attn,
|
||||||
const bool enable_checkpointing) {
|
const bool enable_checkpointing,
|
||||||
|
const bool measure_only) {
|
||||||
|
|
||||||
ggml_set_scratch(ctx, { 0, 0, nullptr, });
|
ggml_set_scratch(ctx, { 0, 0, nullptr, });
|
||||||
const int n_past = 0;
|
const int n_past = 0;
|
||||||
|
@ -622,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||||
|
|
||||||
// KQ_pos - contains the positions
|
// KQ_pos - contains the positions
|
||||||
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
|
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
|
||||||
ggml_allocr_alloc(alloc, KQ_pos);
|
ggml_set_input(KQ_pos);
|
||||||
if (!ggml_allocr_is_measure(alloc)) {
|
|
||||||
int * data = (int *) KQ_pos->data;
|
|
||||||
for (int i = 0; i < N; ++i) {
|
|
||||||
data[i] = n_past + i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// rope has so much parameters that we make a custom function for it
|
// rope has so much parameters that we make a custom function for it
|
||||||
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
|
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
|
||||||
|
@ -687,9 +614,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||||
struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
|
struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
|
||||||
struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
|
struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
|
||||||
struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
|
struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
|
||||||
struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b));
|
struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
|
||||||
struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b));
|
struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
|
||||||
struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b));
|
struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));
|
||||||
|
|
||||||
struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
|
struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
|
||||||
struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
|
struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
|
||||||
|
@ -732,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||||
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
|
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
|
||||||
struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
|
struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
|
||||||
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
|
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
|
||||||
struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
|
struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
|
||||||
struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
|
struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
|
||||||
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
|
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
|
||||||
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
|
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
|
||||||
struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
|
struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
|
||||||
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
|
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
|
||||||
cur = t30;
|
cur = t30;
|
||||||
if (enable_checkpointing) {
|
if (enable_checkpointing) {
|
||||||
|
@ -780,7 +707,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||||
// input gradient
|
// input gradient
|
||||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
|
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
|
||||||
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
|
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
|
||||||
ggml_allocr_alloc(alloc, t36->grad);
|
ggml_set_input(t36->grad);
|
||||||
// KQ_pos
|
// KQ_pos
|
||||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
|
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
|
||||||
|
|
||||||
|
@ -796,20 +723,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
|
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
|
||||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
|
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
|
||||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
|
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
|
||||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f));
|
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
|
||||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f));
|
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
|
||||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f));
|
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
|
||||||
}
|
}
|
||||||
|
|
||||||
// allocating checkpoints in one block to reduce memory fragmentation
|
// allocating checkpoints in one block to reduce memory fragmentation
|
||||||
// note: they will be freed in reverse order
|
// note: they will be freed in reverse order
|
||||||
for (unsigned int i = 0; i < checkpoints.size(); ++i) {
|
for (unsigned int i = 0; i < checkpoints.size(); ++i) {
|
||||||
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
|
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
|
||||||
ggml_allocr_alloc(alloc, checkpoints[i]);
|
ggml_set_input(checkpoints[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_allocr_alloc_graph(alloc, gb);
|
if (measure_only) {
|
||||||
|
ggml_gallocr_reserve(alloc, gb);
|
||||||
|
} else {
|
||||||
|
ggml_gallocr_alloc_graph(alloc, gb);
|
||||||
|
|
||||||
|
// set KQ_pos
|
||||||
|
{
|
||||||
|
int * data = (int *) KQ_pos->data;
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
data[i] = n_past + i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// remove the additional nodes and leafs
|
// remove the additional nodes and leafs
|
||||||
for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
|
for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
|
||||||
|
@ -859,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
|
||||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
|
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
|
||||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
|
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
|
||||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
|
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
|
||||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
|
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
|
||||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
|
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
|
||||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
|
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
|
||||||
|
|
||||||
init_lora(model, lora);
|
init_lora(model, lora);
|
||||||
|
|
||||||
|
@ -886,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
|
||||||
copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
|
copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
|
||||||
copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
|
copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
|
||||||
copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
|
copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
|
||||||
copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a));
|
copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
|
||||||
copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b));
|
copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
|
||||||
copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a));
|
copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
|
||||||
copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b));
|
copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
|
||||||
copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a));
|
copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
|
||||||
copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b));
|
copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -929,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
|
||||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv);
|
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv);
|
||||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo);
|
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo);
|
||||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm);
|
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm);
|
||||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1);
|
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate);
|
||||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2);
|
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down);
|
||||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3);
|
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up);
|
||||||
|
|
||||||
gguf_add_tensor(fctx, lora->tok_embeddings_a);
|
gguf_add_tensor(fctx, lora->tok_embeddings_a);
|
||||||
gguf_add_tensor(fctx, lora->tok_embeddings_b);
|
gguf_add_tensor(fctx, lora->tok_embeddings_b);
|
||||||
|
@ -955,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
|
||||||
gguf_add_tensor(fctx, layer.wo_b);
|
gguf_add_tensor(fctx, layer.wo_b);
|
||||||
gguf_add_tensor(fctx, layer.ffn_norm_a);
|
gguf_add_tensor(fctx, layer.ffn_norm_a);
|
||||||
gguf_add_tensor(fctx, layer.ffn_norm_b);
|
gguf_add_tensor(fctx, layer.ffn_norm_b);
|
||||||
gguf_add_tensor(fctx, layer.w1_a);
|
gguf_add_tensor(fctx, layer.ffn_gate_a);
|
||||||
gguf_add_tensor(fctx, layer.w1_b);
|
gguf_add_tensor(fctx, layer.ffn_gate_b);
|
||||||
gguf_add_tensor(fctx, layer.w2_a);
|
gguf_add_tensor(fctx, layer.ffn_down_a);
|
||||||
gguf_add_tensor(fctx, layer.w2_b);
|
gguf_add_tensor(fctx, layer.ffn_down_b);
|
||||||
gguf_add_tensor(fctx, layer.w3_a);
|
gguf_add_tensor(fctx, layer.ffn_up_a);
|
||||||
gguf_add_tensor(fctx, layer.w3_b);
|
gguf_add_tensor(fctx, layer.ffn_up_b);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1165,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
|
||||||
write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB"));
|
write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB"));
|
||||||
write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA"));
|
write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA"));
|
||||||
write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB"));
|
write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB"));
|
||||||
write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
|
write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
|
||||||
write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
|
write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
|
||||||
write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
|
write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
|
||||||
write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
|
write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
|
||||||
write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
|
write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
|
||||||
write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
|
write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1200,9 +1139,9 @@ struct train_params {
|
||||||
uint32_t n_rank_wv;
|
uint32_t n_rank_wv;
|
||||||
uint32_t n_rank_wo;
|
uint32_t n_rank_wo;
|
||||||
uint32_t n_rank_ffn_norm;
|
uint32_t n_rank_ffn_norm;
|
||||||
uint32_t n_rank_w1;
|
uint32_t n_rank_ffn_gate;
|
||||||
uint32_t n_rank_w2;
|
uint32_t n_rank_ffn_down;
|
||||||
uint32_t n_rank_w3;
|
uint32_t n_rank_ffn_up;
|
||||||
uint32_t n_rank_tok_embeddings;
|
uint32_t n_rank_tok_embeddings;
|
||||||
uint32_t n_rank_norm;
|
uint32_t n_rank_norm;
|
||||||
uint32_t n_rank_output;
|
uint32_t n_rank_output;
|
||||||
|
@ -1213,9 +1152,9 @@ struct train_params {
|
||||||
bool custom_n_rank_wv;
|
bool custom_n_rank_wv;
|
||||||
bool custom_n_rank_wo;
|
bool custom_n_rank_wo;
|
||||||
bool custom_n_rank_ffn_norm;
|
bool custom_n_rank_ffn_norm;
|
||||||
bool custom_n_rank_w1;
|
bool custom_n_rank_ffn_gate;
|
||||||
bool custom_n_rank_w2;
|
bool custom_n_rank_ffn_down;
|
||||||
bool custom_n_rank_w3;
|
bool custom_n_rank_ffn_up;
|
||||||
bool custom_n_rank_tok_embeddings;
|
bool custom_n_rank_tok_embeddings;
|
||||||
bool custom_n_rank_norm;
|
bool custom_n_rank_norm;
|
||||||
bool custom_n_rank_output;
|
bool custom_n_rank_output;
|
||||||
|
@ -1247,9 +1186,9 @@ static struct train_params get_default_train_params() {
|
||||||
params.n_rank_wv = 4;
|
params.n_rank_wv = 4;
|
||||||
params.n_rank_wo = 4;
|
params.n_rank_wo = 4;
|
||||||
params.n_rank_ffn_norm = 1;
|
params.n_rank_ffn_norm = 1;
|
||||||
params.n_rank_w1 = 4;
|
params.n_rank_ffn_gate = 4;
|
||||||
params.n_rank_w2 = 4;
|
params.n_rank_ffn_down = 4;
|
||||||
params.n_rank_w3 = 4;
|
params.n_rank_ffn_up = 4;
|
||||||
params.n_rank_tok_embeddings = 4;
|
params.n_rank_tok_embeddings = 4;
|
||||||
params.n_rank_norm = 1;
|
params.n_rank_norm = 1;
|
||||||
params.n_rank_output = 4;
|
params.n_rank_output = 4;
|
||||||
|
@ -1260,9 +1199,9 @@ static struct train_params get_default_train_params() {
|
||||||
params.custom_n_rank_wv = false;
|
params.custom_n_rank_wv = false;
|
||||||
params.custom_n_rank_wo = false;
|
params.custom_n_rank_wo = false;
|
||||||
params.custom_n_rank_ffn_norm = false;
|
params.custom_n_rank_ffn_norm = false;
|
||||||
params.custom_n_rank_w1 = false;
|
params.custom_n_rank_ffn_gate = false;
|
||||||
params.custom_n_rank_w2 = false;
|
params.custom_n_rank_ffn_down = false;
|
||||||
params.custom_n_rank_w3 = false;
|
params.custom_n_rank_ffn_up = false;
|
||||||
params.custom_n_rank_tok_embeddings = false;
|
params.custom_n_rank_tok_embeddings = false;
|
||||||
params.custom_n_rank_norm = false;
|
params.custom_n_rank_norm = false;
|
||||||
params.custom_n_rank_output = false;
|
params.custom_n_rank_output = false;
|
||||||
|
@ -1293,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
|
||||||
fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n");
|
fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n");
|
||||||
fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n");
|
fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n");
|
||||||
fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n");
|
fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n");
|
||||||
fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n");
|
fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n");
|
||||||
fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n");
|
fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n");
|
||||||
fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n");
|
fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n");
|
||||||
|
|
||||||
print_common_train_usage(argc, argv, &params->common);
|
print_common_train_usage(argc, argv, &params->common);
|
||||||
}
|
}
|
||||||
|
@ -1430,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
|
||||||
}
|
}
|
||||||
params->n_rank_wo = std::stoi(argv[i]);
|
params->n_rank_wo = std::stoi(argv[i]);
|
||||||
params->custom_n_rank_wo = true;
|
params->custom_n_rank_wo = true;
|
||||||
} else if (arg == "--rank-w1") {
|
} else if (arg == "--rank-ffn_gate") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params->n_rank_w1 = std::stoi(argv[i]);
|
params->n_rank_ffn_gate = std::stoi(argv[i]);
|
||||||
params->custom_n_rank_w1 = true;
|
params->custom_n_rank_ffn_gate = true;
|
||||||
} else if (arg == "--rank-w2") {
|
} else if (arg == "--rank-ffn_down") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params->n_rank_w2 = std::stoi(argv[i]);
|
params->n_rank_ffn_down = std::stoi(argv[i]);
|
||||||
params->custom_n_rank_w2 = true;
|
params->custom_n_rank_ffn_down = true;
|
||||||
} else if (arg == "--rank-w3") {
|
} else if (arg == "--rank-ffn_up") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params->n_rank_w3 = std::stoi(argv[i]);
|
params->n_rank_ffn_up = std::stoi(argv[i]);
|
||||||
params->custom_n_rank_w3 = true;
|
params->custom_n_rank_ffn_up = true;
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||||
train_print_usage(argc, argv, &default_params);
|
train_print_usage(argc, argv, &default_params);
|
||||||
|
@ -1513,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) {
|
||||||
nx += ggml_nelements(layer.wo_b);
|
nx += ggml_nelements(layer.wo_b);
|
||||||
nx += ggml_nelements(layer.ffn_norm_a);
|
nx += ggml_nelements(layer.ffn_norm_a);
|
||||||
nx += ggml_nelements(layer.ffn_norm_b);
|
nx += ggml_nelements(layer.ffn_norm_b);
|
||||||
nx += ggml_nelements(layer.w1_a);
|
nx += ggml_nelements(layer.ffn_gate_a);
|
||||||
nx += ggml_nelements(layer.w1_b);
|
nx += ggml_nelements(layer.ffn_gate_b);
|
||||||
nx += ggml_nelements(layer.w2_a);
|
nx += ggml_nelements(layer.ffn_down_a);
|
||||||
nx += ggml_nelements(layer.w2_b);
|
nx += ggml_nelements(layer.ffn_down_b);
|
||||||
nx += ggml_nelements(layer.w3_a);
|
nx += ggml_nelements(layer.ffn_up_a);
|
||||||
nx += ggml_nelements(layer.w3_b);
|
nx += ggml_nelements(layer.ffn_up_b);
|
||||||
}
|
}
|
||||||
return nx;
|
return nx;
|
||||||
}
|
}
|
||||||
|
@ -1572,9 +1511,9 @@ int main(int argc, char ** argv) {
|
||||||
uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r;
|
uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r;
|
||||||
uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r;
|
uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r;
|
||||||
uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1;
|
uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1;
|
||||||
uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r;
|
uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r;
|
||||||
uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r;
|
uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r;
|
||||||
uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r;
|
uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r;
|
||||||
uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
|
uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
|
||||||
uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1;
|
uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1;
|
||||||
uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r;
|
uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r;
|
||||||
|
@ -1584,9 +1523,9 @@ int main(int argc, char ** argv) {
|
||||||
lora.hparams.n_rank_wv = n_rank_wv;
|
lora.hparams.n_rank_wv = n_rank_wv;
|
||||||
lora.hparams.n_rank_wo = n_rank_wo;
|
lora.hparams.n_rank_wo = n_rank_wo;
|
||||||
lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm;
|
lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm;
|
||||||
lora.hparams.n_rank_w1 = n_rank_w1;
|
lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate;
|
||||||
lora.hparams.n_rank_w2 = n_rank_w2;
|
lora.hparams.n_rank_ffn_down = n_rank_ffn_down;
|
||||||
lora.hparams.n_rank_w3 = n_rank_w3;
|
lora.hparams.n_rank_ffn_up = n_rank_ffn_up;
|
||||||
lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
|
lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
|
||||||
lora.hparams.n_rank_norm = n_rank_norm;
|
lora.hparams.n_rank_norm = n_rank_norm;
|
||||||
lora.hparams.n_rank_output = n_rank_output;
|
lora.hparams.n_rank_output = n_rank_output;
|
||||||
|
@ -1627,9 +1566,9 @@ int main(int argc, char ** argv) {
|
||||||
|| (lora.hparams.n_rank_wv != n_rank_wv)
|
|| (lora.hparams.n_rank_wv != n_rank_wv)
|
||||||
|| (lora.hparams.n_rank_wo != n_rank_wo)
|
|| (lora.hparams.n_rank_wo != n_rank_wo)
|
||||||
|| (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm)
|
|| (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm)
|
||||||
|| (lora.hparams.n_rank_w1 != n_rank_w1)
|
|| (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate)
|
||||||
|| (lora.hparams.n_rank_w2 != n_rank_w2)
|
|| (lora.hparams.n_rank_ffn_down != n_rank_ffn_down)
|
||||||
|| (lora.hparams.n_rank_w3 != n_rank_w3)
|
|| (lora.hparams.n_rank_ffn_up != n_rank_ffn_up)
|
||||||
|| (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
|
|| (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
|
||||||
|| (lora.hparams.n_rank_norm != n_rank_norm)
|
|| (lora.hparams.n_rank_norm != n_rank_norm)
|
||||||
|| (lora.hparams.n_rank_output != n_rank_output)
|
|| (lora.hparams.n_rank_output != n_rank_output)
|
||||||
|
@ -1663,7 +1602,7 @@ int main(int argc, char ** argv) {
|
||||||
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
|
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
|
||||||
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
|
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
|
||||||
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
|
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
|
||||||
printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f));
|
printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));
|
||||||
|
|
||||||
if (params.only_write_lora) {
|
if (params.only_write_lora) {
|
||||||
save_train_files_data save_data;
|
save_train_files_data save_data;
|
||||||
|
@ -1690,10 +1629,6 @@ int main(int argc, char ** argv) {
|
||||||
int n_vocab = model.hparams.n_vocab;
|
int n_vocab = model.hparams.n_vocab;
|
||||||
int n_batch = params.common.n_batch;
|
int n_batch = params.common.n_batch;
|
||||||
|
|
||||||
|
|
||||||
std::vector<uint8_t> mem_input_data;
|
|
||||||
std::vector<uint8_t> mem_compute_data;
|
|
||||||
|
|
||||||
// context for input tensors without their data
|
// context for input tensors without their data
|
||||||
struct ggml_init_params ctx_input_params = {
|
struct ggml_init_params ctx_input_params = {
|
||||||
ggml_tensor_overhead() * 2, // mem_size
|
ggml_tensor_overhead() * 2, // mem_size
|
||||||
|
@ -1706,17 +1641,11 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
|
struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
|
||||||
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
|
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
|
||||||
|
|
||||||
// measure required memory for input tensors
|
|
||||||
size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
|
|
||||||
GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
|
|
||||||
tensor_alignment;
|
|
||||||
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
|
|
||||||
|
|
||||||
// allocate input tensors
|
// allocate input tensors
|
||||||
mem_input_data.resize(max_input_size);
|
// measure required memory for input tensors
|
||||||
ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
|
ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
|
||||||
ggml_allocr_alloc(alloc_inps, tokens_input);
|
size_t max_input_size = ggml_backend_buffer_get_size(input_data);
|
||||||
ggml_allocr_alloc(alloc_inps, target_probs);
|
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
|
||||||
|
|
||||||
// context for compute tensors without their data
|
// context for compute tensors without their data
|
||||||
const size_t estimated_compute_size_wo_data = (
|
const size_t estimated_compute_size_wo_data = (
|
||||||
|
@ -1743,7 +1672,7 @@ int main(int argc, char ** argv) {
|
||||||
// find best evaluation order
|
// find best evaluation order
|
||||||
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
|
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
|
||||||
ctx_compute = ggml_init(ctx_compute_params);
|
ctx_compute = ggml_init(ctx_compute_params);
|
||||||
ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
|
ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
|
||||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
gf->order = (enum ggml_cgraph_eval_order) order;
|
gf->order = (enum ggml_cgraph_eval_order) order;
|
||||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
|
@ -1756,14 +1685,15 @@ int main(int argc, char ** argv) {
|
||||||
&logits, tokens_input, target_probs,
|
&logits, tokens_input, target_probs,
|
||||||
n_tokens, n_batch,
|
n_tokens, n_batch,
|
||||||
params.common.use_flash,
|
params.common.use_flash,
|
||||||
params.common.use_checkpointing
|
params.common.use_checkpointing,
|
||||||
|
true
|
||||||
);
|
);
|
||||||
size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
|
size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
|
||||||
if (max_compute_size < best_compute_size) {
|
if (max_compute_size < best_compute_size) {
|
||||||
best_compute_size = max_compute_size;
|
best_compute_size = max_compute_size;
|
||||||
best_order = gf->order;
|
best_order = gf->order;
|
||||||
}
|
}
|
||||||
ggml_allocr_free(alloc);
|
ggml_gallocr_free(alloc);
|
||||||
ggml_free(ctx_compute);
|
ggml_free(ctx_compute);
|
||||||
}
|
}
|
||||||
size_t max_compute_size = best_compute_size;
|
size_t max_compute_size = best_compute_size;
|
||||||
|
@ -1774,9 +1704,8 @@ int main(int argc, char ** argv) {
|
||||||
"invalid");
|
"invalid");
|
||||||
|
|
||||||
// allocate compute tensors
|
// allocate compute tensors
|
||||||
mem_compute_data.resize(max_compute_size);
|
|
||||||
ctx_compute = ggml_init(ctx_compute_params);
|
ctx_compute = ggml_init(ctx_compute_params);
|
||||||
ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
|
ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
|
||||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
gf->order = best_order;
|
gf->order = best_order;
|
||||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
|
@ -1789,11 +1718,9 @@ int main(int argc, char ** argv) {
|
||||||
&logits, tokens_input, target_probs,
|
&logits, tokens_input, target_probs,
|
||||||
n_tokens, n_batch,
|
n_tokens, n_batch,
|
||||||
params.common.use_flash,
|
params.common.use_flash,
|
||||||
params.common.use_checkpointing
|
params.common.use_checkpointing,
|
||||||
|
false
|
||||||
);
|
);
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
ggml_allocr_free(alloc_inps);
|
|
||||||
|
|
||||||
|
|
||||||
// tokenize data
|
// tokenize data
|
||||||
std::vector<llama_token> train_tokens;
|
std::vector<llama_token> train_tokens;
|
||||||
|
@ -1908,6 +1835,8 @@ int main(int argc, char ** argv) {
|
||||||
ggml_free(ctx_work);
|
ggml_free(ctx_work);
|
||||||
ggml_free(ctx_compute);
|
ggml_free(ctx_compute);
|
||||||
ggml_free(ctx_input);
|
ggml_free(ctx_input);
|
||||||
|
ggml_gallocr_free(alloc);
|
||||||
|
|
||||||
|
|
||||||
int64_t t1 = ggml_time_ms();
|
int64_t t1 = ggml_time_ms();
|
||||||
printf("%s: total training time: ", __func__);
|
printf("%s: total training time: ", __func__);
|
||||||
|
|
|
@ -367,7 +367,7 @@ struct clip_ctx {
|
||||||
ggml_backend_buffer_t params_buffer = NULL;
|
ggml_backend_buffer_t params_buffer = NULL;
|
||||||
ggml_backend_buffer_t compute_buffer = NULL;
|
ggml_backend_buffer_t compute_buffer = NULL;
|
||||||
ggml_backend_t backend = NULL;
|
ggml_backend_t backend = NULL;
|
||||||
ggml_allocr * compute_alloc = NULL;
|
ggml_gallocr_t compute_alloc = NULL;
|
||||||
};
|
};
|
||||||
|
|
||||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||||
|
@ -405,31 +405,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
|
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
|
||||||
ggml_allocr_alloc(ctx->compute_alloc, inp_raw);
|
ggml_set_name(inp_raw, "inp_raw");
|
||||||
|
ggml_set_input(inp_raw);
|
||||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
|
||||||
float * data = (float *)malloc(ggml_nbytes(inp_raw));
|
|
||||||
|
|
||||||
for (size_t i = 0; i < imgs->size; i++) {
|
|
||||||
const int nx = imgs->data[i].nx;
|
|
||||||
const int ny = imgs->data[i].ny;
|
|
||||||
GGML_ASSERT(nx == image_size && ny == image_size);
|
|
||||||
|
|
||||||
const int n = nx * ny;
|
|
||||||
|
|
||||||
for (int b = 0; b < batch_size; b++) {
|
|
||||||
for (int k = 0; k < 3; k++) {
|
|
||||||
for (int y = 0; y < ny; y++) {
|
|
||||||
for (int x = 0; x < nx; x++) {
|
|
||||||
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
|
||||||
free(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||||
|
|
||||||
|
@ -438,13 +415,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
|
|
||||||
// concat class_embeddings and patch_embeddings
|
// concat class_embeddings and patch_embeddings
|
||||||
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||||
ggml_allocr_alloc(ctx->compute_alloc, embeddings);
|
ggml_set_name(embeddings, "embeddings");
|
||||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
ggml_set_input(embeddings);
|
||||||
void* zero_mem = malloc(ggml_nbytes(embeddings));
|
|
||||||
memset(zero_mem, 0, ggml_nbytes(embeddings));
|
|
||||||
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
|
||||||
free(zero_mem);
|
|
||||||
}
|
|
||||||
|
|
||||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||||
|
@ -453,15 +425,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||||
|
|
||||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||||
ggml_allocr_alloc(ctx->compute_alloc, positions);
|
ggml_set_name(positions, "positions");
|
||||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
ggml_set_input(positions);
|
||||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
|
||||||
for (int i = 0; i < num_positions; i++) {
|
|
||||||
positions_data[i] = i;
|
|
||||||
}
|
|
||||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
|
||||||
free(positions_data);
|
|
||||||
}
|
|
||||||
|
|
||||||
embeddings =
|
embeddings =
|
||||||
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||||
|
@ -560,15 +525,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||||
|
|
||||||
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
||||||
ggml_allocr_alloc(ctx->compute_alloc, patches);
|
ggml_set_name(patches, "patches");
|
||||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
ggml_set_input(patches);
|
||||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
|
||||||
for (int i = 0; i < num_patches; i++) {
|
|
||||||
patches_data[i] = i + 1;
|
|
||||||
}
|
|
||||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
|
||||||
free(patches_data);
|
|
||||||
}
|
|
||||||
|
|
||||||
// shape [1, 576, 1024]
|
// shape [1, 576, 1024]
|
||||||
// ne is whcn, ne = [1024, 576, 1, 1]
|
// ne is whcn, ne = [1024, 576, 1, 1]
|
||||||
|
@ -809,7 +767,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// data
|
// data
|
||||||
size_t buffer_size = 0;
|
size_t model_size = 0;
|
||||||
{
|
{
|
||||||
for (int i = 0; i < n_tensors; ++i) {
|
for (int i = 0; i < n_tensors; ++i) {
|
||||||
const char * name = gguf_get_tensor_name(ctx, i);
|
const char * name = gguf_get_tensor_name(ctx, i);
|
||||||
|
@ -817,7 +775,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
enum ggml_type type = gguf_get_tensor_type(ctx, i);
|
enum ggml_type type = gguf_get_tensor_type(ctx, i);
|
||||||
struct ggml_tensor * cur = ggml_get_tensor(meta, name);
|
struct ggml_tensor * cur = ggml_get_tensor(meta, name);
|
||||||
size_t tensor_size = ggml_nbytes(cur);
|
size_t tensor_size = ggml_nbytes(cur);
|
||||||
buffer_size += tensor_size;
|
model_size += tensor_size;
|
||||||
if (verbosity >= 3) {
|
if (verbosity >= 3) {
|
||||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||||
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
||||||
|
@ -825,8 +783,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
buffer_size += n_tensors * 128 /* CLIP PADDING */;
|
|
||||||
|
|
||||||
clip_ctx * new_clip = new clip_ctx;
|
clip_ctx * new_clip = new clip_ctx;
|
||||||
|
|
||||||
// update projector type
|
// update projector type
|
||||||
|
@ -886,12 +842,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||||
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||||
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||||
printf("%s: model size: %.2f MB\n", __func__, buffer_size / 1024.0 / 1024.0);
|
printf("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||||
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, buffer_size / (1024.0 * 1024.0), n_tensors);
|
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
||||||
|
|
||||||
// load tensors
|
// load tensors
|
||||||
{
|
{
|
||||||
|
@ -925,12 +881,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// alloc memory and offload data
|
// alloc memory and offload data
|
||||||
new_clip->params_buffer = ggml_backend_alloc_buffer(new_clip->backend, buffer_size);
|
new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
|
||||||
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
|
|
||||||
for (int i = 0; i < n_tensors; ++i) {
|
for (int i = 0; i < n_tensors; ++i) {
|
||||||
const char * name = gguf_get_tensor_name(ctx, i);
|
const char * name = gguf_get_tensor_name(ctx, i);
|
||||||
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
||||||
ggml_allocr_alloc(alloc, cur);
|
|
||||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||||
fin.seekg(offset, std::ios::beg);
|
fin.seekg(offset, std::ios::beg);
|
||||||
if (!fin) {
|
if (!fin) {
|
||||||
|
@ -949,7 +903,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ggml_allocr_free(alloc);
|
|
||||||
fin.close();
|
fin.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1077,15 +1030,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
// measure mem requirement and allocate
|
// measure mem requirement and allocate
|
||||||
{
|
{
|
||||||
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||||
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
|
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
||||||
clip_image_f32_batch batch;
|
clip_image_f32_batch batch;
|
||||||
batch.size = 1;
|
batch.size = 1;
|
||||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
||||||
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_clip->compute_alloc, gf);
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||||
ggml_allocr_free(new_clip->compute_alloc);
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||||
new_clip->compute_buffer = ggml_backend_alloc_buffer(new_clip->backend, compute_memory_buffer_size);
|
|
||||||
new_clip->compute_alloc = ggml_allocr_new_from_buffer(new_clip->compute_buffer);
|
|
||||||
|
|
||||||
printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1267,12 +1217,72 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
||||||
}
|
}
|
||||||
|
|
||||||
// reset alloc buffer to clean the memory from previous invocations
|
|
||||||
ggml_allocr_reset(ctx->compute_alloc);
|
|
||||||
|
|
||||||
// build the inference graph
|
// build the inference graph
|
||||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||||
ggml_allocr_alloc_graph(ctx->compute_alloc, gf);
|
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||||
|
|
||||||
|
// set inputs
|
||||||
|
const auto & model = ctx->vision_model;
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
const int image_size = hparams.image_size;
|
||||||
|
const int patch_size = hparams.patch_size;
|
||||||
|
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
||||||
|
const int num_positions = num_patches + 1;
|
||||||
|
|
||||||
|
{
|
||||||
|
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
|
||||||
|
float * data = (float *)malloc(ggml_nbytes(inp_raw));
|
||||||
|
|
||||||
|
for (size_t i = 0; i < imgs->size; i++) {
|
||||||
|
const int nx = imgs->data[i].nx;
|
||||||
|
const int ny = imgs->data[i].ny;
|
||||||
|
GGML_ASSERT(nx == image_size && ny == image_size);
|
||||||
|
|
||||||
|
const int n = nx * ny;
|
||||||
|
|
||||||
|
for (int b = 0; b < batch_size; b++) {
|
||||||
|
for (int k = 0; k < 3; k++) {
|
||||||
|
for (int y = 0; y < ny; y++) {
|
||||||
|
for (int x = 0; x < nx; x++) {
|
||||||
|
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
||||||
|
free(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
|
||||||
|
|
||||||
|
void* zero_mem = malloc(ggml_nbytes(embeddings));
|
||||||
|
memset(zero_mem, 0, ggml_nbytes(embeddings));
|
||||||
|
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
||||||
|
free(zero_mem);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||||
|
|
||||||
|
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||||
|
for (int i = 0; i < num_positions; i++) {
|
||||||
|
positions_data[i] = i;
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||||
|
free(positions_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
||||||
|
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||||
|
for (int i = 0; i < num_patches; i++) {
|
||||||
|
patches_data[i] = i + 1;
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||||
|
free(patches_data);
|
||||||
|
}
|
||||||
|
|
||||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||||
|
|
|
@ -71,7 +71,7 @@ def bytes_to_unicode():
|
||||||
return dict(zip(bs, cs))
|
return dict(zip(bs, cs))
|
||||||
|
|
||||||
|
|
||||||
ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py")
|
ap = argparse.ArgumentParser()
|
||||||
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
||||||
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
||||||
ap.add_argument("--text-only", action="store_true", required=False,
|
ap.add_argument("--text-only", action="store_true", required=False,
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -73,6 +75,8 @@ int main(int argc, char ** argv){
|
||||||
int n_drafted = 0;
|
int n_drafted = 0;
|
||||||
int n_accept = 0;
|
int n_accept = 0;
|
||||||
|
|
||||||
|
int64_t t_draft_us = 0;
|
||||||
|
|
||||||
int n_past = inp.size();
|
int n_past = inp.size();
|
||||||
|
|
||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
@ -160,7 +164,7 @@ int main(int argc, char ** argv){
|
||||||
|
|
||||||
// generate n_pred tokens through prompt lookup
|
// generate n_pred tokens through prompt lookup
|
||||||
auto prompt_lookup = [&]() -> void {
|
auto prompt_lookup = [&]() -> void {
|
||||||
int inp_size = inp.size();
|
const int inp_size = inp.size();
|
||||||
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
|
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
|
||||||
const llama_token * ngram = &inp[inp_size - ngram_size];
|
const llama_token * ngram = &inp[inp_size - ngram_size];
|
||||||
|
|
||||||
|
@ -191,8 +195,12 @@ int main(int argc, char ** argv){
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const int64_t t_start_draft_us = ggml_time_us();
|
||||||
|
|
||||||
prompt_lookup();
|
prompt_lookup();
|
||||||
|
|
||||||
|
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||||
|
|
||||||
llama_decode(ctx, batch_tgt);
|
llama_decode(ctx, batch_tgt);
|
||||||
++n_past;
|
++n_past;
|
||||||
|
|
||||||
|
@ -210,6 +218,8 @@ int main(int argc, char ** argv){
|
||||||
LOG_TEE("n_draft = %d\n", n_draft);
|
LOG_TEE("n_draft = %d\n", n_draft);
|
||||||
LOG_TEE("n_predict = %d\n", n_predict);
|
LOG_TEE("n_predict = %d\n", n_predict);
|
||||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||||
|
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||||
|
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
||||||
LOG_TEE("n_accept = %d\n", n_accept);
|
LOG_TEE("n_accept = %d\n", n_accept);
|
||||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
|
|
|
@ -98,7 +98,7 @@ static void write_logfile(
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
static void sigint_handler(int signo) {
|
static void sigint_handler(int signo) {
|
||||||
if (signo == SIGINT) {
|
if (signo == SIGINT) {
|
||||||
if (!is_interacting) {
|
if (!is_interacting && g_params->interactive) {
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
} else {
|
} else {
|
||||||
console::cleanup();
|
console::cleanup();
|
||||||
|
@ -392,7 +392,8 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.interactive) {
|
// ctrl+C handling
|
||||||
|
{
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||||
struct sigaction sigint_action;
|
struct sigaction sigint_action;
|
||||||
sigint_action.sa_handler = sigint_handler;
|
sigint_action.sa_handler = sigint_handler;
|
||||||
|
@ -405,7 +406,9 @@ int main(int argc, char ** argv) {
|
||||||
};
|
};
|
||||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.interactive) {
|
||||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||||
|
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
|
|
|
@ -185,7 +185,7 @@ node index.js
|
||||||
|
|
||||||
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
|
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
|
||||||
|
|
||||||
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
|
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, as in `"logit_bias": [[15043,false]]`, ensures that the token `Hello` is never produced. Tokens can also be given as strings, e.g. `[["Hello, World!",-0.5]]` reduces the likelihood of all the individual tokens that represent the string `Hello, World!`, much like `presence_penalty` does (default: []).
|
||||||
|
|
||||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||||
|
|
||||||
|
|
|
@ -15,9 +15,13 @@
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
inline static json oaicompat_completion_params_parse(
|
inline static json oaicompat_completion_params_parse(
|
||||||
const json &body /* openai api json semantics */)
|
const json &body, /* openai api json semantics */
|
||||||
|
const std::string &chat_template)
|
||||||
{
|
{
|
||||||
json llama_params;
|
json llama_params;
|
||||||
|
std::string formatted_prompt = chat_template == "chatml"
|
||||||
|
? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...)
|
||||||
|
: format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...)
|
||||||
|
|
||||||
llama_params["__oaicompat"] = true;
|
llama_params["__oaicompat"] = true;
|
||||||
|
|
||||||
|
@ -30,7 +34,7 @@ inline static json oaicompat_completion_params_parse(
|
||||||
// https://platform.openai.com/docs/api-reference/chat/create
|
// https://platform.openai.com/docs/api-reference/chat/create
|
||||||
llama_sampling_params default_sparams;
|
llama_sampling_params default_sparams;
|
||||||
llama_params["model"] = json_value(body, "model", std::string("unknown"));
|
llama_params["model"] = json_value(body, "model", std::string("unknown"));
|
||||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
llama_params["prompt"] = formatted_prompt;
|
||||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
||||||
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
||||||
|
|
|
@ -36,6 +36,7 @@ struct server_params
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::vector<std::string> api_keys;
|
std::vector<std::string> api_keys;
|
||||||
std::string public_path = "examples/server/public";
|
std::string public_path = "examples/server/public";
|
||||||
|
std::string chat_template = "chatml";
|
||||||
int32_t port = 8080;
|
int32_t port = 8080;
|
||||||
int32_t read_timeout = 600;
|
int32_t read_timeout = 600;
|
||||||
int32_t write_timeout = 600;
|
int32_t write_timeout = 600;
|
||||||
|
@ -625,18 +626,36 @@ struct llama_server_context
|
||||||
const int n_vocab = llama_n_vocab(model);
|
const int n_vocab = llama_n_vocab(model);
|
||||||
for (const auto &el : *logit_bias)
|
for (const auto &el : *logit_bias)
|
||||||
{
|
{
|
||||||
if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
|
if (el.is_array() && el.size() == 2)
|
||||||
|
{
|
||||||
|
float bias;
|
||||||
|
if (el[1].is_number())
|
||||||
|
{
|
||||||
|
bias = el[1].get<float>();
|
||||||
|
}
|
||||||
|
else if (el[1].is_boolean() && !el[1].get<bool>())
|
||||||
|
{
|
||||||
|
bias = -INFINITY;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (el[0].is_number_integer())
|
||||||
{
|
{
|
||||||
llama_token tok = el[0].get<llama_token>();
|
llama_token tok = el[0].get<llama_token>();
|
||||||
if (tok >= 0 && tok < n_vocab)
|
if (tok >= 0 && tok < n_vocab)
|
||||||
{
|
{
|
||||||
if (el[1].is_number())
|
slot->sparams.logit_bias[tok] = bias;
|
||||||
{
|
|
||||||
slot->sparams.logit_bias[tok] = el[1].get<float>();
|
|
||||||
}
|
}
|
||||||
else if (el[1].is_boolean() && !el[1].get<bool>())
|
}
|
||||||
|
else if (el[0].is_string())
|
||||||
{
|
{
|
||||||
slot->sparams.logit_bias[tok] = -INFINITY;
|
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||||
|
for (auto tok : toks)
|
||||||
|
{
|
||||||
|
slot->sparams.logit_bias[tok] = bias;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1859,6 +1878,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||||
|
printf(" --chat-template FORMAT_NAME");
|
||||||
|
printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2290,6 +2311,21 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
log_set_target(stdout);
|
log_set_target(stdout);
|
||||||
LOG_INFO("logging to file is disabled.", {});
|
LOG_INFO("logging to file is disabled.", {});
|
||||||
}
|
}
|
||||||
|
else if (arg == "--chat-template")
|
||||||
|
{
|
||||||
|
if (++i >= argc)
|
||||||
|
{
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
std::string value(argv[i]);
|
||||||
|
if (value != "chatml" && value != "llama2") {
|
||||||
|
fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str());
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
sparams.chat_template = value;
|
||||||
|
}
|
||||||
else if (arg == "--override-kv")
|
else if (arg == "--override-kv")
|
||||||
{
|
{
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
|
@ -2743,13 +2779,13 @@ int main(int argc, char **argv)
|
||||||
|
|
||||||
|
|
||||||
// TODO: add mount point without "/v1" prefix -- how?
|
// TODO: add mount point without "/v1" prefix -- how?
|
||||||
svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
|
svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
|
||||||
{
|
{
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||||
if (!validate_api_key(req, res)) {
|
if (!validate_api_key(req, res)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
json data = oaicompat_completion_params_parse(json::parse(req.body));
|
json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template);
|
||||||
|
|
||||||
const int task_id = llama.queue_tasks.get_new_id();
|
const int task_id = llama.queue_tasks.get_new_id();
|
||||||
llama.queue_results.add_waiting_task_id(task_id);
|
llama.queue_results.add_waiting_task_id(task_id);
|
||||||
|
|
|
@ -167,6 +167,34 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
||||||
: default_value;
|
: default_value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline std::string format_llama2(std::vector<json> messages)
|
||||||
|
{
|
||||||
|
std::ostringstream output;
|
||||||
|
bool is_inside_turn = false;
|
||||||
|
|
||||||
|
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||||
|
if (!is_inside_turn) {
|
||||||
|
output << "[INST] ";
|
||||||
|
}
|
||||||
|
std::string role = json_value(*it, "role", std::string("user"));
|
||||||
|
std::string content = json_value(*it, "content", std::string(""));
|
||||||
|
if (role == "system") {
|
||||||
|
output << "<<SYS>>\n" << content << "\n<<SYS>>\n\n";
|
||||||
|
is_inside_turn = true;
|
||||||
|
} else if (role == "user") {
|
||||||
|
output << content << " [/INST]";
|
||||||
|
is_inside_turn = true;
|
||||||
|
} else {
|
||||||
|
output << " " << content << " </s>";
|
||||||
|
is_inside_turn = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_VERBOSE("format_llama2", {{"text", output.str()}});
|
||||||
|
|
||||||
|
return output.str();
|
||||||
|
}
|
||||||
|
|
||||||
inline std::string format_chatml(std::vector<json> messages)
|
inline std::string format_chatml(std::vector<json> messages)
|
||||||
{
|
{
|
||||||
std::ostringstream chatml_msgs;
|
std::ostringstream chatml_msgs;
|
||||||
|
@ -180,6 +208,8 @@ inline std::string format_chatml(std::vector<json> messages)
|
||||||
|
|
||||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
chatml_msgs << "<|im_start|>assistant" << '\n';
|
||||||
|
|
||||||
|
LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}});
|
||||||
|
|
||||||
return chatml_msgs.str();
|
return chatml_msgs.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "train.h"
|
#include "train.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
@ -19,8 +20,6 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static const size_t tensor_alignment = 32;
|
|
||||||
|
|
||||||
struct my_llama_hparams {
|
struct my_llama_hparams {
|
||||||
uint32_t n_vocab = 32000;
|
uint32_t n_vocab = 32000;
|
||||||
uint32_t n_ctx = 512;
|
uint32_t n_ctx = 512;
|
||||||
|
@ -51,14 +50,14 @@ struct my_llama_layer {
|
||||||
struct ggml_tensor * ffn_norm;
|
struct ggml_tensor * ffn_norm;
|
||||||
|
|
||||||
// ff
|
// ff
|
||||||
struct ggml_tensor * w1;
|
struct ggml_tensor * ffn_gate; // w1
|
||||||
struct ggml_tensor * w2;
|
struct ggml_tensor * ffn_down; // w2
|
||||||
struct ggml_tensor * w3;
|
struct ggml_tensor * ffn_up; // w3
|
||||||
};
|
};
|
||||||
|
|
||||||
struct my_llama_model {
|
struct my_llama_model {
|
||||||
struct ggml_context * ctx = NULL;
|
struct ggml_context * ctx = NULL;
|
||||||
std::vector<uint8_t> data;
|
ggml_backend_buffer_t data = NULL;
|
||||||
|
|
||||||
my_llama_hparams hparams;
|
my_llama_hparams hparams;
|
||||||
|
|
||||||
|
@ -141,42 +140,9 @@ static void set_param_model(struct my_llama_model * model) {
|
||||||
ggml_set_param(ctx, layer.wv);
|
ggml_set_param(ctx, layer.wv);
|
||||||
ggml_set_param(ctx, layer.wo);
|
ggml_set_param(ctx, layer.wo);
|
||||||
ggml_set_param(ctx, layer.ffn_norm);
|
ggml_set_param(ctx, layer.ffn_norm);
|
||||||
ggml_set_param(ctx, layer.w1);
|
ggml_set_param(ctx, layer.ffn_gate);
|
||||||
ggml_set_param(ctx, layer.w2);
|
ggml_set_param(ctx, layer.ffn_down);
|
||||||
ggml_set_param(ctx, layer.w3);
|
ggml_set_param(ctx, layer.ffn_up);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) {
|
|
||||||
ggml_allocr_alloc(alloc, model->tok_embeddings);
|
|
||||||
ggml_allocr_alloc(alloc, model->norm);
|
|
||||||
ggml_allocr_alloc(alloc, model->output);
|
|
||||||
for (uint32_t i = 0; i < model->layers.size(); ++i) {
|
|
||||||
auto & layer = model->layers[i];
|
|
||||||
ggml_allocr_alloc(alloc, layer.attention_norm);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wq);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wk);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wv);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wo);
|
|
||||||
ggml_allocr_alloc(alloc, layer.ffn_norm);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w1);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w2);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w3);
|
|
||||||
}
|
|
||||||
ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
|
|
||||||
ggml_allocr_alloc(alloc, model->norm->grad);
|
|
||||||
ggml_allocr_alloc(alloc, model->output->grad);
|
|
||||||
for (uint32_t i = 0; i < model->layers.size(); ++i) {
|
|
||||||
auto & layer = model->layers[i];
|
|
||||||
ggml_allocr_alloc(alloc, layer.attention_norm->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wq->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wk->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wv->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.wo->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.ffn_norm->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w1->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w2->grad);
|
|
||||||
ggml_allocr_alloc(alloc, layer.w3->grad);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -232,9 +198,9 @@ static void init_model(struct my_llama_model * model) {
|
||||||
|
|
||||||
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||||
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
|
layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
|
||||||
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||||
|
|
||||||
ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
|
ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
|
||||||
|
|
||||||
|
@ -245,24 +211,15 @@ static void init_model(struct my_llama_model * model) {
|
||||||
|
|
||||||
ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i));
|
ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i));
|
||||||
|
|
||||||
ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i));
|
ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i));
|
||||||
ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i));
|
ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||||
ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i));
|
ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i));
|
||||||
}
|
}
|
||||||
|
|
||||||
set_param_model(model);
|
set_param_model(model);
|
||||||
|
|
||||||
// measure data size
|
|
||||||
size_t size = 0;
|
|
||||||
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
||||||
size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
|
|
||||||
}
|
|
||||||
|
|
||||||
// allocate data
|
// allocate data
|
||||||
struct ggml_allocr * alloc = NULL;
|
model->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
|
||||||
model->data.resize(size + tensor_alignment);
|
|
||||||
alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
|
|
||||||
alloc_model(alloc, model);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
|
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||||
|
@ -287,9 +244,9 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
|
||||||
|
|
||||||
randomize_tensor_normal(layer.ffn_norm, rnd);
|
randomize_tensor_normal(layer.ffn_norm, rnd);
|
||||||
|
|
||||||
randomize_tensor_normal(layer.w1, rnd);
|
randomize_tensor_normal(layer.ffn_gate, rnd);
|
||||||
randomize_tensor_normal(layer.w2, rnd);
|
randomize_tensor_normal(layer.ffn_down, rnd);
|
||||||
randomize_tensor_normal(layer.w3, rnd);
|
randomize_tensor_normal(layer.ffn_up, rnd);
|
||||||
}
|
}
|
||||||
|
|
||||||
free_random_normal_distribution(rnd);
|
free_random_normal_distribution(rnd);
|
||||||
|
@ -297,7 +254,7 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
|
||||||
|
|
||||||
static struct ggml_tensor * llama_build_train_graphs(
|
static struct ggml_tensor * llama_build_train_graphs(
|
||||||
struct my_llama_model * model,
|
struct my_llama_model * model,
|
||||||
struct ggml_allocr * alloc,
|
ggml_gallocr_t alloc,
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_cgraph * gf,
|
struct ggml_cgraph * gf,
|
||||||
struct ggml_cgraph * gb,
|
struct ggml_cgraph * gb,
|
||||||
|
@ -308,7 +265,8 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||||
const int n_tokens,
|
const int n_tokens,
|
||||||
const int n_batch,
|
const int n_batch,
|
||||||
const bool enable_flash_attn,
|
const bool enable_flash_attn,
|
||||||
const bool enable_checkpointing) {
|
const bool enable_checkpointing,
|
||||||
|
const bool measure_only) {
|
||||||
|
|
||||||
ggml_set_scratch(ctx, { 0, 0, nullptr, });
|
ggml_set_scratch(ctx, { 0, 0, nullptr, });
|
||||||
const int n_past = 0;
|
const int n_past = 0;
|
||||||
|
@ -334,13 +292,7 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||||
|
|
||||||
// KQ_pos - contains the positions
|
// KQ_pos - contains the positions
|
||||||
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
|
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
|
||||||
ggml_allocr_alloc(alloc, KQ_pos);
|
ggml_set_input(KQ_pos);
|
||||||
if (!ggml_allocr_is_measure(alloc)) {
|
|
||||||
int * data = (int *) KQ_pos->data;
|
|
||||||
for (int i = 0; i < N; ++i) {
|
|
||||||
data[i] = n_past + i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// rope has so much parameters that we make a custom function for it
|
// rope has so much parameters that we make a custom function for it
|
||||||
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
|
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
|
||||||
|
@ -404,11 +356,11 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||||
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
|
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
|
||||||
struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
|
struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
|
||||||
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
|
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
|
||||||
struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
|
struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
|
||||||
struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
|
struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
|
||||||
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
|
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
|
||||||
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
|
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
|
||||||
struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
|
struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
|
||||||
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
|
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
|
||||||
cur = t30;
|
cur = t30;
|
||||||
checkpoints.push_back(cur);
|
checkpoints.push_back(cur);
|
||||||
|
@ -448,21 +400,31 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||||
// KQ_pos
|
// KQ_pos
|
||||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
|
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
|
||||||
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
|
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
|
||||||
|
ggml_set_input(t36->grad);
|
||||||
ggml_allocr_alloc(alloc, t36->grad);
|
|
||||||
|
|
||||||
// allocating checkpoints in one block to reduce memory fragmentation
|
// allocating checkpoints in one block to reduce memory fragmentation
|
||||||
// note: they will be freed in reverse order
|
// note: they will be freed in reverse order
|
||||||
for (int i = 0; i < (int) checkpoints.size(); ++i) {
|
for (int i = 0; i < (int) checkpoints.size(); ++i) {
|
||||||
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
|
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
|
||||||
ggml_allocr_alloc(alloc, checkpoints[i]);
|
ggml_set_input(checkpoints[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//int n_leafs_after = gb->n_leafs;
|
//int n_leafs_after = gb->n_leafs;
|
||||||
//int n_nodes_after = gb->n_nodes;
|
//int n_nodes_after = gb->n_nodes;
|
||||||
|
if (measure_only) {
|
||||||
|
// FIXME: will still allocate
|
||||||
|
ggml_gallocr_reserve(alloc, gb);
|
||||||
|
} else {
|
||||||
|
ggml_gallocr_alloc_graph(alloc, gb);
|
||||||
|
|
||||||
ggml_allocr_alloc_graph(alloc, gb);
|
if (!measure_only) {
|
||||||
|
int * data = (int *) KQ_pos->data;
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
data[i] = n_past + i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// remove the additional nodes and leafs
|
// remove the additional nodes and leafs
|
||||||
for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
|
for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
|
||||||
|
@ -559,9 +521,9 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
|
||||||
copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
|
copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
|
||||||
copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
|
copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
|
||||||
copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
|
copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
|
||||||
copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
|
copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
|
||||||
copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
|
copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||||
copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
|
copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -702,9 +664,9 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
|
||||||
gguf_add_tensor(fctx, layer.wv);
|
gguf_add_tensor(fctx, layer.wv);
|
||||||
gguf_add_tensor(fctx, layer.wo);
|
gguf_add_tensor(fctx, layer.wo);
|
||||||
gguf_add_tensor(fctx, layer.ffn_norm);
|
gguf_add_tensor(fctx, layer.ffn_norm);
|
||||||
gguf_add_tensor(fctx, layer.w1);
|
gguf_add_tensor(fctx, layer.ffn_gate);
|
||||||
gguf_add_tensor(fctx, layer.w2);
|
gguf_add_tensor(fctx, layer.ffn_down);
|
||||||
gguf_add_tensor(fctx, layer.w3);
|
gguf_add_tensor(fctx, layer.ffn_up);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -953,9 +915,9 @@ static int64_t get_parameter_count(struct my_llama_model* model) {
|
||||||
nx += ggml_nelements(layer.wv);
|
nx += ggml_nelements(layer.wv);
|
||||||
nx += ggml_nelements(layer.wo);
|
nx += ggml_nelements(layer.wo);
|
||||||
nx += ggml_nelements(layer.ffn_norm);
|
nx += ggml_nelements(layer.ffn_norm);
|
||||||
nx += ggml_nelements(layer.w1);
|
nx += ggml_nelements(layer.ffn_gate);
|
||||||
nx += ggml_nelements(layer.w2);
|
nx += ggml_nelements(layer.ffn_down);
|
||||||
nx += ggml_nelements(layer.w3);
|
nx += ggml_nelements(layer.ffn_up);
|
||||||
}
|
}
|
||||||
return nx;
|
return nx;
|
||||||
}
|
}
|
||||||
|
@ -1046,7 +1008,7 @@ int main(int argc, char ** argv) {
|
||||||
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
|
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
|
||||||
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
|
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
|
||||||
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
|
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
|
||||||
printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f));
|
printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)), (float) (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)) / (1024.0f*1024.0f));
|
||||||
|
|
||||||
if (params.only_write_model) {
|
if (params.only_write_model) {
|
||||||
save_train_files_data save_data;
|
save_train_files_data save_data;
|
||||||
|
@ -1073,11 +1035,6 @@ int main(int argc, char ** argv) {
|
||||||
int n_vocab = model.hparams.n_vocab;
|
int n_vocab = model.hparams.n_vocab;
|
||||||
int n_batch = params.common.n_batch;
|
int n_batch = params.common.n_batch;
|
||||||
|
|
||||||
std::vector<uint8_t> mem_input_data;
|
|
||||||
std::vector<uint8_t> mem_compute_data;
|
|
||||||
|
|
||||||
ggml_allocr * alloc = NULL;
|
|
||||||
|
|
||||||
// context for input tensors without their data
|
// context for input tensors without their data
|
||||||
struct ggml_init_params ctx_input_params = {
|
struct ggml_init_params ctx_input_params = {
|
||||||
ggml_tensor_overhead() * 2, // mem_size
|
ggml_tensor_overhead() * 2, // mem_size
|
||||||
|
@ -1091,16 +1048,10 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
|
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
|
||||||
|
|
||||||
// measure required memory for input tensors
|
// measure required memory for input tensors
|
||||||
size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
|
|
||||||
GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
|
|
||||||
tensor_alignment;
|
|
||||||
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
|
|
||||||
|
|
||||||
// allocate input tensors
|
// allocate input tensors
|
||||||
mem_input_data.resize(max_input_size);
|
ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
|
||||||
alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
|
size_t max_input_size = ggml_backend_buffer_get_size(input_data);
|
||||||
ggml_allocr_alloc(alloc, tokens_input);
|
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
|
||||||
ggml_allocr_alloc(alloc, target_probs);
|
|
||||||
|
|
||||||
// context for compute tensors without their data
|
// context for compute tensors without their data
|
||||||
const size_t estimated_compute_size_wo_data = (
|
const size_t estimated_compute_size_wo_data = (
|
||||||
|
@ -1127,7 +1078,7 @@ int main(int argc, char ** argv) {
|
||||||
// find best evaluation order
|
// find best evaluation order
|
||||||
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
|
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
|
||||||
ctx_compute = ggml_init(ctx_compute_params);
|
ctx_compute = ggml_init(ctx_compute_params);
|
||||||
alloc = ggml_allocr_new_measure(tensor_alignment);
|
ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
|
||||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
gf->order = (enum ggml_cgraph_eval_order) order;
|
gf->order = (enum ggml_cgraph_eval_order) order;
|
||||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
|
@ -1140,9 +1091,10 @@ int main(int argc, char ** argv) {
|
||||||
&logits, tokens_input, target_probs,
|
&logits, tokens_input, target_probs,
|
||||||
n_tokens, n_batch,
|
n_tokens, n_batch,
|
||||||
params.common.use_flash,
|
params.common.use_flash,
|
||||||
params.common.use_checkpointing
|
params.common.use_checkpointing,
|
||||||
|
true
|
||||||
);
|
);
|
||||||
size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
|
size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
|
||||||
if (max_compute_size < best_compute_size) {
|
if (max_compute_size < best_compute_size) {
|
||||||
best_compute_size = max_compute_size;
|
best_compute_size = max_compute_size;
|
||||||
best_order = gf->order;
|
best_order = gf->order;
|
||||||
|
@ -1157,9 +1109,8 @@ int main(int argc, char ** argv) {
|
||||||
"invalid");
|
"invalid");
|
||||||
|
|
||||||
// allocate compute tensors
|
// allocate compute tensors
|
||||||
mem_compute_data.resize(max_compute_size);
|
|
||||||
ctx_compute = ggml_init(ctx_compute_params);
|
ctx_compute = ggml_init(ctx_compute_params);
|
||||||
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
|
ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
|
||||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
gf->order = best_order;
|
gf->order = best_order;
|
||||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||||
|
@ -1172,7 +1123,8 @@ int main(int argc, char ** argv) {
|
||||||
&logits, tokens_input, target_probs,
|
&logits, tokens_input, target_probs,
|
||||||
n_tokens, n_batch,
|
n_tokens, n_batch,
|
||||||
params.common.use_flash,
|
params.common.use_flash,
|
||||||
params.common.use_checkpointing
|
params.common.use_checkpointing,
|
||||||
|
false
|
||||||
);
|
);
|
||||||
|
|
||||||
std::vector<llama_token> train_tokens;
|
std::vector<llama_token> train_tokens;
|
||||||
|
|
6
flake.lock
generated
6
flake.lock
generated
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1706732774,
|
"lastModified": 1707268954,
|
||||||
"narHash": "sha256-hqJlyJk4MRpcItGYMF+3uHe8HvxNETWvlGtLuVpqLU0=",
|
"narHash": "sha256-2en1kvde3cJVc3ZnTy8QeD2oKcseLFjYPLKhIGDanQ0=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "b8b232ae7b8b144397fdb12d20f592e5e7c1a64d",
|
"rev": "f8e2ebd66d097614d51a56a755450d4ae1632df1",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
1235
ggml-alloc.c
1235
ggml-alloc.c
File diff suppressed because it is too large
Load diff
104
ggml-alloc.h
104
ggml-alloc.h
|
@ -6,88 +6,62 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
struct ggml_backend;
|
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
||||||
struct ggml_backend_buffer;
|
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||||
struct ggml_backend_buffer_type;
|
typedef struct ggml_backend * ggml_backend_t;
|
||||||
|
|
||||||
//
|
|
||||||
// Legacy API
|
|
||||||
//
|
|
||||||
|
|
||||||
typedef struct ggml_allocr * ggml_allocr_t;
|
|
||||||
|
|
||||||
// initialize allocator for use with CPU backend only
|
|
||||||
GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
|
|
||||||
GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
|
|
||||||
|
|
||||||
// initialize allocator for use with ggml-backend
|
|
||||||
GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
|
||||||
GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
|
|
||||||
GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
|
|
||||||
|
|
||||||
GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
|
|
||||||
|
|
||||||
// tell the allocator to parse nodes following the order described in the list
|
|
||||||
// you should call this if your graph are optimized to execute out-of-order
|
|
||||||
GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
|
|
||||||
|
|
||||||
GGML_API void ggml_allocr_free (ggml_allocr_t alloc);
|
|
||||||
GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc);
|
|
||||||
GGML_API void ggml_allocr_reset (ggml_allocr_t alloc);
|
|
||||||
GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor);
|
|
||||||
GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc);
|
|
||||||
|
|
||||||
GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
|
|
||||||
|
|
||||||
//
|
|
||||||
// ggml-backend v2 API
|
|
||||||
//
|
|
||||||
|
|
||||||
// Separate tensor and graph allocator objects
|
|
||||||
// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
|
|
||||||
// The original API is kept as a wrapper around the new API
|
|
||||||
|
|
||||||
// Tensor allocator
|
// Tensor allocator
|
||||||
typedef struct ggml_tallocr * ggml_tallocr_t;
|
typedef struct ggml_tallocr * ggml_tallocr_t;
|
||||||
|
|
||||||
GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
|
GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
|
||||||
GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
|
|
||||||
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
|
|
||||||
GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
|
|
||||||
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
|
||||||
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
|
|
||||||
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
|
|
||||||
|
|
||||||
GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
|
|
||||||
|
|
||||||
GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
|
GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
|
||||||
GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc);
|
|
||||||
GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc);
|
|
||||||
GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
|
GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc);
|
|
||||||
|
|
||||||
|
|
||||||
// Graph allocator
|
// Graph allocator
|
||||||
|
/*
|
||||||
|
Example usage:
|
||||||
|
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
|
||||||
|
|
||||||
|
// optional: create a worst-case graph and reserve the buffers to avoid reallocations
|
||||||
|
ggml_gallocr_reserve(galloc, build_graph(max_batch));
|
||||||
|
|
||||||
|
// allocate the graph
|
||||||
|
struct ggml_cgraph * graph = build_graph(batch);
|
||||||
|
ggml_gallocr_alloc_graph(galloc, graph);
|
||||||
|
|
||||||
|
printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
|
||||||
|
|
||||||
|
// evaluate the graph
|
||||||
|
ggml_backend_graph_compute(backend, graph);
|
||||||
|
*/
|
||||||
|
|
||||||
|
// special tensor flags for use with the graph allocator:
|
||||||
|
// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
|
||||||
|
// ggml_set_output(): output tensors are never freed and never overwritten
|
||||||
|
|
||||||
typedef struct ggml_gallocr * ggml_gallocr_t;
|
typedef struct ggml_gallocr * ggml_gallocr_t;
|
||||||
|
|
||||||
GGML_API ggml_gallocr_t ggml_gallocr_new(void);
|
GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
|
||||||
|
GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
|
||||||
GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
|
GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
|
||||||
|
|
||||||
GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
|
// pre-allocate buffers from a measure graph - does not allocate or modify the graph
|
||||||
GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
|
// call with a worst-case graph to avoid buffer reallocations
|
||||||
|
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
|
||||||
|
// returns false if the buffer allocation failed
|
||||||
|
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
|
||||||
|
GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
|
||||||
|
|
||||||
// Allocate tensors from the allocators given by the hash table
|
// automatic reallocation if the topology changes when using a single buffer
|
||||||
GGML_API void ggml_gallocr_alloc_graph_n(
|
// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
|
||||||
ggml_gallocr_t galloc,
|
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
|
||||||
struct ggml_cgraph * graph,
|
|
||||||
struct ggml_hash_set hash_set,
|
|
||||||
ggml_tallocr_t * hash_node_talloc);
|
|
||||||
|
|
||||||
|
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
||||||
|
|
||||||
// Utils
|
// Utils
|
||||||
// Create a buffer and allocate all the tensors in a ggml_context
|
// Create a buffer and allocate all the tensors in a ggml_context
|
||||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
|
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
|
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
490
ggml-backend.c
490
ggml-backend.c
|
@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
|
||||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||||
|
|
||||||
|
if (!size) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
|
tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
|
||||||
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
|
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||||
|
|
||||||
|
if (!size) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
|
tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -475,6 +483,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
||||||
|
|
||||||
// backend CPU
|
// backend CPU
|
||||||
|
|
||||||
|
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
|
||||||
|
|
||||||
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
||||||
return "CPU";
|
return "CPU";
|
||||||
|
|
||||||
|
@ -482,7 +492,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
return (void *)buffer->context;
|
uintptr_t data = (uintptr_t)buffer->context;
|
||||||
|
|
||||||
|
// align the buffer
|
||||||
|
if (data % TENSOR_ALIGNMENT != 0) {
|
||||||
|
data = GGML_PAD(data, TENSOR_ALIGNMENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (void *)data;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
|
@ -540,8 +557,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
||||||
/* .reset = */ NULL,
|
/* .reset = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
|
||||||
|
|
||||||
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
return "CPU";
|
return "CPU";
|
||||||
|
|
||||||
|
@ -550,9 +565,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend
|
||||||
|
|
||||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
||||||
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
|
||||||
|
if (data == NULL) {
|
||||||
GGML_ASSERT(data != NULL && "failed to allocate buffer");
|
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
||||||
}
|
}
|
||||||
|
@ -766,6 +783,9 @@ static struct ggml_backend_i cpu_backend_i = {
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_cpu_init(void) {
|
ggml_backend_t ggml_backend_cpu_init(void) {
|
||||||
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
|
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
|
||||||
|
if (ctx == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||||
ctx->work_data = NULL;
|
ctx->work_data = NULL;
|
||||||
|
@ -774,6 +794,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
||||||
ctx->abort_callback_data = NULL;
|
ctx->abort_callback_data = NULL;
|
||||||
|
|
||||||
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
||||||
|
if (cpu_backend == NULL) {
|
||||||
|
free(ctx);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
*cpu_backend = (struct ggml_backend) {
|
*cpu_backend = (struct ggml_backend) {
|
||||||
/* .interface = */ cpu_backend_i,
|
/* .interface = */ cpu_backend_i,
|
||||||
|
@ -802,6 +826,7 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
||||||
|
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
||||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -865,6 +890,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
|
||||||
ctx->n_buffers = n_buffers;
|
ctx->n_buffers = n_buffers;
|
||||||
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
|
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
|
||||||
|
|
||||||
|
GGML_ASSERT(ctx->buffers != NULL);
|
||||||
|
|
||||||
size_t total_size = 0;
|
size_t total_size = 0;
|
||||||
for (size_t i = 0; i < n_buffers; i++) {
|
for (size_t i = 0; i < n_buffers; i++) {
|
||||||
ctx->buffers[i] = buffers[i];
|
ctx->buffers[i] = buffers[i];
|
||||||
|
@ -886,6 +913,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// creates a copy of the tensor with the same memory layout
|
||||||
|
static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
|
||||||
|
struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||||
|
dup->nb[i] = tensor->nb[i];
|
||||||
|
}
|
||||||
|
return dup;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_is_view_op(enum ggml_op op) {
|
||||||
|
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
||||||
|
}
|
||||||
|
|
||||||
// scheduler
|
// scheduler
|
||||||
|
|
||||||
|
@ -894,7 +933,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer,
|
||||||
#define GGML_MAX_SPLIT_INPUTS 16
|
#define GGML_MAX_SPLIT_INPUTS 16
|
||||||
|
|
||||||
struct ggml_backend_sched_split {
|
struct ggml_backend_sched_split {
|
||||||
ggml_tallocr_t tallocr;
|
int backend_id;
|
||||||
int i_start;
|
int i_start;
|
||||||
int i_end;
|
int i_end;
|
||||||
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
|
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
|
||||||
|
@ -909,15 +948,17 @@ struct ggml_backend_sched {
|
||||||
int n_backends;
|
int n_backends;
|
||||||
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
||||||
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
|
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
|
||||||
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
|
|
||||||
|
|
||||||
ggml_gallocr_t galloc;
|
ggml_gallocr_t galloc;
|
||||||
|
|
||||||
// hash keys of the nodes in the graph
|
// hash keys of the nodes in the graph
|
||||||
struct ggml_hash_set hash_set;
|
struct ggml_hash_set hash_set;
|
||||||
// hash values (arrays of [hash_set.size])
|
// hash values
|
||||||
ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
|
int * tensor_backend_id;
|
||||||
struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
|
struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
|
||||||
|
|
||||||
|
int * node_backend_ids; // [n_nodes]
|
||||||
|
int n_nodes;
|
||||||
|
|
||||||
// copy of the graph with modified inputs
|
// copy of the graph with modified inputs
|
||||||
struct ggml_cgraph * graph;
|
struct ggml_cgraph * graph;
|
||||||
|
@ -927,77 +968,46 @@ struct ggml_backend_sched {
|
||||||
|
|
||||||
struct ggml_context * ctx;
|
struct ggml_context * ctx;
|
||||||
|
|
||||||
|
ggml_backend_sched_eval_callback callback_eval;
|
||||||
|
void * callback_eval_user_data;
|
||||||
|
|
||||||
// align context_buffer to GGML_MEM_ALIGN
|
// align context_buffer to GGML_MEM_ALIGN
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
__declspec(align(GGML_MEM_ALIGN))
|
__declspec(align(GGML_MEM_ALIGN))
|
||||||
#else
|
#else
|
||||||
__attribute__((aligned(GGML_MEM_ALIGN)))
|
__attribute__((aligned(GGML_MEM_ALIGN)))
|
||||||
#endif
|
#endif
|
||||||
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
|
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
|
||||||
|
|
||||||
ggml_backend_sched_eval_callback callback_eval;
|
|
||||||
void * callback_eval_user_data;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
|
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
|
||||||
#define node_allocr(node) sched->node_talloc[hash_id(node)]
|
#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
|
||||||
|
#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
|
||||||
|
|
||||||
static bool ggml_is_view_op(enum ggml_op op) {
|
// returns the priority of the backend, lower id is higher priority
|
||||||
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||||
}
|
|
||||||
|
|
||||||
// returns the priority of the backend, lower is better
|
|
||||||
static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
for (int i = 0; i < sched->n_backends; i++) {
|
||||||
if (sched->backends[i] == backend) {
|
if (sched->backends[i] == backend) {
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return INT_MAX;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
|
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
|
||||||
if (sched->tallocs[i] == allocr) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return INT_MAX;
|
|
||||||
}
|
|
||||||
|
|
||||||
static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
|
|
||||||
if (buffer == NULL) {
|
if (buffer == NULL) {
|
||||||
return NULL;
|
return -1;
|
||||||
}
|
|
||||||
|
|
||||||
// check if this is already allocated in an allocr buffer (from user manual allocations)
|
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
|
||||||
if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
|
|
||||||
return sched->tallocs[i];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// find highest prio backend that supports the buffer type
|
// find highest prio backend that supports the buffer type
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
for (int i = 0; i < sched->n_backends; i++) {
|
||||||
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
|
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
|
||||||
return sched->tallocs[i];
|
return i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
|
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
|
|
||||||
if (allocr == NULL) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
|
||||||
if (sched->tallocs[i] == allocr) {
|
|
||||||
return sched->backends[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
GGML_UNREACHABLE();
|
|
||||||
}
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
|
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
|
||||||
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
||||||
|
@ -1008,37 +1018,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// returns the backend that should be used for the node based on the current locations
|
// returns the backend that should be used for the node based on the current locations
|
||||||
static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
|
||||||
|
// TODO: use supports_op to check if the backend supports the op
|
||||||
|
|
||||||
// assign pre-allocated nodes to their backend
|
// assign pre-allocated nodes to their backend
|
||||||
// dst
|
// dst
|
||||||
ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
|
int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
|
||||||
if (cur_allocr != NULL) {
|
if (cur_backend != -1) {
|
||||||
SET_CAUSE(node, "1.dst");
|
SET_CAUSE(node, "1.dst");
|
||||||
return cur_allocr;
|
return cur_backend;
|
||||||
}
|
}
|
||||||
// view_src
|
// view_src
|
||||||
if (node->view_src != NULL) {
|
if (tensor->view_src != NULL) {
|
||||||
cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
|
cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
|
||||||
if (cur_allocr != NULL) {
|
if (cur_backend != -1) {
|
||||||
SET_CAUSE(node, "1.vsrc");
|
SET_CAUSE(node, "1.vsrc");
|
||||||
return cur_allocr;
|
return cur_backend;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// assign nodes that use weights to the backend of the weights
|
// assign nodes that use weights to the backend of the weights
|
||||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
const struct ggml_tensor * src = node->src[i];
|
const struct ggml_tensor * src = tensor->src[i];
|
||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||||
ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
|
int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
|
||||||
// operations with weights are always run on the same backend as the weights
|
// operations with weights are always run on the same backend as the weights
|
||||||
SET_CAUSE(node, "1.wgt%d", i);
|
SET_CAUSE(node, "1.wgt%d", i);
|
||||||
return src_allocr;
|
return src_backend;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return NULL;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static char * fmt_size(size_t size) {
|
static char * fmt_size(size_t size) {
|
||||||
|
@ -1051,11 +1063,11 @@ static char * fmt_size(size_t size) {
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||||
int cur_split = 0;
|
int cur_split = 0;
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
||||||
ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
|
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
||||||
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
|
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
|
||||||
sched->splits[cur_split].n_inputs);
|
sched->splits[cur_split].n_inputs);
|
||||||
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
||||||
|
@ -1069,17 +1081,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
||||||
if (ggml_is_view_op(node->op)) {
|
if (ggml_is_view_op(node->op)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
ggml_backend_t tensor_backend = tensor_backend(node);
|
||||||
ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
|
|
||||||
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
||||||
fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
|
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
struct ggml_tensor * src = node->src[j];
|
struct ggml_tensor * src = node->src[j];
|
||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
ggml_tallocr_t src_allocr = node_allocr(src);
|
ggml_backend_t src_backend = tensor_backend(src);
|
||||||
ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
|
|
||||||
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
||||||
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
||||||
}
|
}
|
||||||
|
@ -1087,23 +1097,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// creates a copy of the tensor with the same memory layout
|
|
||||||
static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
|
|
||||||
struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
|
|
||||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
||||||
dup->nb[i] = tensor->nb[i];
|
|
||||||
}
|
|
||||||
return dup;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
//#define DEBUG_PASS1
|
//#define DEBUG_PASS1
|
||||||
//#define DEBUG_PASS2
|
//#define DEBUG_PASS2
|
||||||
//#define DEBUG_PASS3
|
//#define DEBUG_PASS3
|
||||||
//#define DEBUG_PASS4
|
//#define DEBUG_PASS4
|
||||||
|
|
||||||
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
||||||
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||||
// reset splits
|
// reset splits
|
||||||
sched->n_splits = 0;
|
sched->n_splits = 0;
|
||||||
sched->is_reset = false;
|
sched->is_reset = false;
|
||||||
|
@ -1125,28 +1125,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
// pass 1: assign backends to ops with pre-allocated inputs
|
// pass 1: assign backends to ops with pre-allocated inputs
|
||||||
for (int i = 0; i < graph->n_leafs; i++) {
|
for (int i = 0; i < graph->n_leafs; i++) {
|
||||||
struct ggml_tensor * leaf = graph->leafs[i];
|
struct ggml_tensor * leaf = graph->leafs[i];
|
||||||
if (node_allocr(leaf) != NULL) {
|
if (tensor_backend_id(leaf) != -1) {
|
||||||
// do not overwrite user assignments
|
// do not overwrite user assignments
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
|
tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
if (node_allocr(node) != NULL) {
|
if (tensor_backend_id(node) != -1) {
|
||||||
// do not overwrite user assignments
|
// do not overwrite user assignments
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
node_allocr(node) = sched_allocr_from_cur(sched, node);
|
tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
|
||||||
// src
|
// src
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
struct ggml_tensor * src = node->src[j];
|
struct ggml_tensor * src = node->src[j];
|
||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (node_allocr(src) == NULL) {
|
if (tensor_backend_id(src) == -1) {
|
||||||
node_allocr(src) = sched_allocr_from_cur(sched, src);
|
tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1161,22 +1161,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
|
|
||||||
// pass 2.1 expand gpu up
|
// pass 2.1 expand gpu up
|
||||||
{
|
{
|
||||||
ggml_tallocr_t cur_allocr = NULL;
|
int cur_backend_id = -1;
|
||||||
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
if (ggml_is_view_op(node->op)) {
|
if (ggml_is_view_op(node->op)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
int tensor_backend_id = tensor_backend_id(node);
|
||||||
if (node_allocr != NULL) {
|
if (tensor_backend_id != -1) {
|
||||||
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
if (tensor_backend_id == sched->n_backends - 1) {
|
||||||
// skip cpu (lowest prio backend)
|
// skip cpu (lowest prio backend)
|
||||||
cur_allocr = NULL;
|
cur_backend_id = -1;
|
||||||
} else {
|
} else {
|
||||||
cur_allocr = node_allocr;
|
cur_backend_id = tensor_backend_id;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
node_allocr(node) = cur_allocr;
|
tensor_backend_id(node) = cur_backend_id;
|
||||||
SET_CAUSE(node, "2.1");
|
SET_CAUSE(node, "2.1");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1184,22 +1184,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
|
|
||||||
// pass 2.2 expand gpu down
|
// pass 2.2 expand gpu down
|
||||||
{
|
{
|
||||||
ggml_tallocr_t cur_allocr = NULL;
|
int cur_backend_id = -1;
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
if (ggml_is_view_op(node->op)) {
|
if (ggml_is_view_op(node->op)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
int tensor_backend_id = tensor_backend_id(node);
|
||||||
if (node_allocr != NULL) {
|
if (tensor_backend_id != -1) {
|
||||||
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
if (tensor_backend_id == sched->n_backends - 1) {
|
||||||
// skip cpu (lowest prio backend)
|
// skip cpu (lowest prio backend)
|
||||||
cur_allocr = NULL;
|
cur_backend_id = -1;
|
||||||
} else {
|
} else {
|
||||||
cur_allocr = node_allocr;
|
cur_backend_id = tensor_backend_id;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
node_allocr(node) = cur_allocr;
|
tensor_backend_id(node) = cur_backend_id;
|
||||||
SET_CAUSE(node, "2.2");
|
SET_CAUSE(node, "2.2");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1207,17 +1207,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
|
|
||||||
// pass 2.3 expand rest up
|
// pass 2.3 expand rest up
|
||||||
{
|
{
|
||||||
ggml_tallocr_t cur_allocr = NULL;
|
int cur_backend_id = -1;
|
||||||
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
if (ggml_is_view_op(node->op)) {
|
if (ggml_is_view_op(node->op)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
int tensor_backend_id = tensor_backend_id(node);
|
||||||
if (node_allocr != NULL) {
|
if (tensor_backend_id != -1) {
|
||||||
cur_allocr = node_allocr;
|
cur_backend_id = tensor_backend_id;
|
||||||
} else {
|
} else {
|
||||||
node_allocr(node) = cur_allocr;
|
tensor_backend_id(node) = cur_backend_id;
|
||||||
SET_CAUSE(node, "2.3");
|
SET_CAUSE(node, "2.3");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1225,17 +1225,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
|
|
||||||
// pass 2.4 expand rest down
|
// pass 2.4 expand rest down
|
||||||
{
|
{
|
||||||
ggml_tallocr_t cur_allocr = NULL;
|
int cur_backend_id = -1;
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
if (ggml_is_view_op(node->op)) {
|
if (ggml_is_view_op(node->op)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
int tensor_backend_id = tensor_backend_id(node);
|
||||||
if (node_allocr != NULL) {
|
if (tensor_backend_id != -1) {
|
||||||
cur_allocr = node_allocr;
|
cur_backend_id = tensor_backend_id;
|
||||||
} else {
|
} else {
|
||||||
node_allocr(node) = cur_allocr;
|
tensor_backend_id(node) = cur_backend_id;
|
||||||
SET_CAUSE(node, "2.4");
|
SET_CAUSE(node, "2.4");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1247,9 +1247,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
// pass 3: assign backends to remaining src from dst and view_src
|
// pass 3: assign backends to remaining src from dst and view_src
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
ggml_tallocr_t cur_allocr = node_allocr(node);
|
int cur_backend_id = tensor_backend_id(node);
|
||||||
if (node->view_src != NULL && cur_allocr == NULL) {
|
if (node->view_src != NULL && cur_backend_id == -1) {
|
||||||
cur_allocr = node_allocr(node) = node_allocr(node->view_src);
|
cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
|
||||||
SET_CAUSE(node, "3.vsrc");
|
SET_CAUSE(node, "3.vsrc");
|
||||||
}
|
}
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
@ -1257,14 +1257,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
ggml_tallocr_t src_allocr = node_allocr(src);
|
int src_backend_id = tensor_backend_id(src);
|
||||||
if (src_allocr == NULL) {
|
if (src_backend_id == -1) {
|
||||||
if (src->view_src != NULL) {
|
if (src->view_src != NULL) {
|
||||||
// views are always on the same backend as the source
|
// views are always on the same backend as the source
|
||||||
node_allocr(src) = node_allocr(src->view_src);
|
tensor_backend_id(src) = tensor_backend_id(src->view_src);
|
||||||
SET_CAUSE(src, "3.vsrc");
|
SET_CAUSE(src, "3.vsrc");
|
||||||
} else {
|
} else {
|
||||||
node_allocr(src) = cur_allocr;
|
tensor_backend_id(src) = cur_backend_id;
|
||||||
SET_CAUSE(src, "3.cur");
|
SET_CAUSE(src, "3.cur");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1281,15 +1281,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
if (!ggml_is_view_op(node->op)) {
|
if (!ggml_is_view_op(node->op)) {
|
||||||
sched->splits[0].tallocr = node_allocr(node);
|
sched->splits[0].backend_id = tensor_backend_id(node);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sched->splits[0].i_start = 0;
|
sched->splits[0].i_start = 0;
|
||||||
sched->splits[0].n_inputs = 0;
|
sched->splits[0].n_inputs = 0;
|
||||||
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
||||||
ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
|
int cur_backend_id = sched->splits[0].backend_id;
|
||||||
size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
|
|
||||||
|
@ -1297,19 +1296,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
int tensor_backend_id = tensor_backend_id(node);
|
||||||
|
|
||||||
GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
|
GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
|
||||||
|
|
||||||
if (node_allocr != cur_allocr) {
|
if (tensor_backend_id != cur_backend_id) {
|
||||||
sched->splits[cur_split].i_end = i;
|
sched->splits[cur_split].i_end = i;
|
||||||
cur_split++;
|
cur_split++;
|
||||||
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
||||||
sched->splits[cur_split].tallocr = node_allocr;
|
sched->splits[cur_split].backend_id = tensor_backend_id;
|
||||||
sched->splits[cur_split].i_start = i;
|
sched->splits[cur_split].i_start = i;
|
||||||
sched->splits[cur_split].n_inputs = 0;
|
sched->splits[cur_split].n_inputs = 0;
|
||||||
cur_allocr = node_allocr;
|
cur_backend_id = tensor_backend_id;
|
||||||
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// find inputs that are not on the same backend
|
// find inputs that are not on the same backend
|
||||||
|
@ -1318,43 +1316,25 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
ggml_tallocr_t src_allocr = node_allocr(src);
|
int src_backend_id = tensor_backend_id(src);
|
||||||
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
|
assert(src_backend_id != -1); // all inputs should be assigned by now
|
||||||
if (src_allocr != node_allocr) {
|
if (src_backend_id != tensor_backend_id) {
|
||||||
// create a copy of the input in the split's backend
|
// create a copy of the input in the split's backend
|
||||||
size_t id = hash_id(src);
|
size_t id = hash_id(src);
|
||||||
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
if (sched->tensor_copies[id][cur_backend_id] == NULL) {
|
||||||
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
ggml_backend_t backend = sched->backends[cur_backend_id];
|
||||||
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
||||||
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
||||||
|
|
||||||
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
sched->tensor_copies[id][cur_backend_id] = tensor_copy;
|
||||||
node_allocr(tensor_copy) = cur_allocr;
|
tensor_backend_id(tensor_copy) = cur_backend_id;
|
||||||
SET_CAUSE(tensor_copy, "4.cpy");
|
SET_CAUSE(tensor_copy, "4.cpy");
|
||||||
|
|
||||||
int n_inputs = sched->splits[cur_split].n_inputs++;
|
int n_inputs = sched->splits[cur_split].n_inputs++;
|
||||||
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
||||||
sched->splits[cur_split].inputs[n_inputs] = src;
|
sched->splits[cur_split].inputs[n_inputs] = src;
|
||||||
}
|
}
|
||||||
node->src[j] = sched->node_copies[id][cur_backend_id];
|
node->src[j] = sched->tensor_copies[id][cur_backend_id];
|
||||||
|
|
||||||
#if 0
|
|
||||||
// check if the input is already in the split
|
|
||||||
bool found = false;
|
|
||||||
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
|
||||||
if (sched->splits[cur_split].inputs[k] == src) {
|
|
||||||
found = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!found) {
|
|
||||||
int n_inputs = sched->splits[cur_split].n_inputs++;
|
|
||||||
//printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
|
|
||||||
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
|
||||||
sched->splits[cur_split].inputs[n_inputs] = src;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1369,30 +1349,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
// sanity check: all sources should have the same backend as the node
|
// sanity check: all sources should have the same backend as the node
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
ggml_backend_t tensor_backend = tensor_backend(node);
|
||||||
if (node_allocr == NULL) {
|
if (tensor_backend == NULL) {
|
||||||
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
||||||
}
|
}
|
||||||
if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
|
if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
|
||||||
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
||||||
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
|
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
||||||
node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
|
node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
|
||||||
}
|
}
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
struct ggml_tensor * src = node->src[j];
|
struct ggml_tensor * src = node->src[j];
|
||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
ggml_tallocr_t src_allocr = node_allocr(src);
|
ggml_backend_t src_backend = tensor_backend(src);
|
||||||
if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
|
if (src_backend != tensor_backend /* && src_backend != NULL */) {
|
||||||
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
|
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
|
||||||
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
|
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
||||||
j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
|
j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
|
||||||
}
|
}
|
||||||
if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
|
if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
|
||||||
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
||||||
src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
|
src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
|
||||||
src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
|
src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1406,32 +1386,45 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
struct ggml_backend_sched_split * split = &sched->splits[i];
|
struct ggml_backend_sched_split * split = &sched->splits[i];
|
||||||
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
||||||
|
|
||||||
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
|
||||||
for (int j = 0; j < split->n_inputs; j++) {
|
for (int j = 0; j < split->n_inputs; j++) {
|
||||||
struct ggml_tensor * input = split->inputs[j];
|
struct ggml_tensor * input = split->inputs[j];
|
||||||
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
|
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
|
||||||
|
|
||||||
// add a dependency to the input source so that it is not freed before the copy is done
|
// add a dependency to the input source so that it is not freed before the copy is done
|
||||||
GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
|
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
||||||
input_cpy->src[0] = input;
|
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
|
||||||
|
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
||||||
|
|
||||||
|
// add a dependency to the input copy so that it is allocated at the start of the split
|
||||||
|
sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
|
||||||
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int j = split->i_start; j < split->i_end; j++) {
|
for (int j = split->i_start; j < split->i_end; j++) {
|
||||||
|
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
|
||||||
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sched->graph = graph_copy;
|
sched->graph = graph_copy;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void sched_alloc_splits(ggml_backend_sched_t sched) {
|
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||||
ggml_gallocr_alloc_graph_n(
|
// ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
|
||||||
sched->galloc,
|
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
||||||
sched->graph,
|
#ifndef NDEBUG
|
||||||
sched->hash_set,
|
fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
|
||||||
sched->node_talloc);
|
#endif
|
||||||
|
ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
|
||||||
|
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
||||||
|
fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void sched_compute_splits(ggml_backend_sched_t sched) {
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
||||||
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
|
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
|
||||||
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
|
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
|
||||||
|
|
||||||
|
@ -1439,20 +1432,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
||||||
|
|
||||||
for (int i = 0; i < sched->n_splits; i++) {
|
for (int i = 0; i < sched->n_splits; i++) {
|
||||||
struct ggml_backend_sched_split * split = &splits[i];
|
struct ggml_backend_sched_split * split = &splits[i];
|
||||||
ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
|
int split_backend_id = split->backend_id;
|
||||||
int split_backend_id = sched_backend_prio(sched, split_backend);
|
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||||
|
|
||||||
// copy the input tensors to the split backend
|
// copy the input tensors to the split backend
|
||||||
uint64_t copy_start_us = ggml_time_us();
|
uint64_t copy_start_us = ggml_time_us();
|
||||||
for (int j = 0; j < split->n_inputs; j++) {
|
for (int j = 0; j < split->n_inputs; j++) {
|
||||||
struct ggml_tensor * input = split->inputs[j];
|
struct ggml_tensor * input = split->inputs[j];
|
||||||
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
|
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
|
||||||
|
|
||||||
GGML_ASSERT(input->buffer != NULL);
|
GGML_ASSERT(input->buffer != NULL);
|
||||||
GGML_ASSERT(input_cpy->buffer != NULL);
|
GGML_ASSERT(input_cpy->buffer != NULL);
|
||||||
|
|
||||||
// TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
|
|
||||||
// this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
|
|
||||||
ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
|
ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
|
||||||
}
|
}
|
||||||
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
|
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
|
||||||
|
@ -1468,7 +1459,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
||||||
|
|
||||||
uint64_t compute_start_us = ggml_time_us();
|
uint64_t compute_start_us = ggml_time_us();
|
||||||
if (!sched->callback_eval) {
|
if (!sched->callback_eval) {
|
||||||
ggml_backend_graph_compute(split_backend, &split->graph);
|
if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
||||||
} else {
|
} else {
|
||||||
// similar to ggml_backend_compare_graph_backend
|
// similar to ggml_backend_compare_graph_backend
|
||||||
|
@ -1488,7 +1481,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
||||||
|
|
||||||
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
||||||
|
|
||||||
ggml_backend_graph_compute(split_backend, &gv);
|
if (!ggml_backend_graph_compute(split_backend, &gv)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
||||||
break;
|
break;
|
||||||
|
@ -1510,19 +1505,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
|
|
||||||
static void sched_reset(ggml_backend_sched_t sched) {
|
return true;
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
|
||||||
ggml_tallocr_reset(sched->tallocs[i]);
|
|
||||||
}
|
|
||||||
// reset state for the next run
|
|
||||||
size_t hash_size = sched->hash_set.size;
|
|
||||||
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
|
||||||
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
|
||||||
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
|
||||||
|
|
||||||
sched->is_reset = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
|
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
|
||||||
|
@ -1533,8 +1517,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
|
||||||
|
|
||||||
// initialize hash table
|
// initialize hash table
|
||||||
sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
||||||
sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
|
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
|
||||||
sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
|
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
|
||||||
|
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
|
||||||
|
|
||||||
sched->n_backends = n_backends;
|
sched->n_backends = n_backends;
|
||||||
for (int i = 0; i < n_backends; i++) {
|
for (int i = 0; i < n_backends; i++) {
|
||||||
|
@ -1542,14 +1527,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back
|
||||||
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
|
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
sched->galloc = ggml_gallocr_new();
|
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
||||||
|
|
||||||
// init measure allocs for each backend
|
ggml_backend_sched_reset(sched);
|
||||||
for (int i = 0; i < n_backends; i++) {
|
|
||||||
sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
sched_reset(sched);
|
|
||||||
|
|
||||||
return sched;
|
return sched;
|
||||||
}
|
}
|
||||||
|
@ -1558,49 +1538,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||||
if (sched == NULL) {
|
if (sched == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
|
||||||
ggml_tallocr_free(sched->tallocs[i]);
|
|
||||||
}
|
|
||||||
ggml_gallocr_free(sched->galloc);
|
ggml_gallocr_free(sched->galloc);
|
||||||
ggml_free(sched->ctx);
|
ggml_free(sched->ctx);
|
||||||
free(sched->hash_set.keys);
|
free(sched->hash_set.keys);
|
||||||
free(sched->node_talloc);
|
free(sched->tensor_backend_id);
|
||||||
free(sched->node_copies);
|
free(sched->tensor_copies);
|
||||||
|
free(sched->node_backend_ids);
|
||||||
free(sched);
|
free(sched);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||||
GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
|
// reset state for the next run
|
||||||
|
size_t hash_size = sched->hash_set.size;
|
||||||
|
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
|
||||||
|
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
|
||||||
|
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
||||||
|
|
||||||
sched_split_graph(sched, measure_graph);
|
sched->is_reset = true;
|
||||||
sched_alloc_splits(sched);
|
|
||||||
|
|
||||||
// allocate buffers and reset allocators
|
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
|
||||||
size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
|
|
||||||
ggml_tallocr_free(sched->tallocs[i]);
|
|
||||||
sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sched_reset(sched);
|
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
||||||
|
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||||
|
|
||||||
|
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
ggml_backend_sched_reset(sched);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
||||||
|
|
||||||
if (!sched->is_reset) {
|
if (!sched->is_reset) {
|
||||||
sched_reset(sched);
|
ggml_backend_sched_reset(sched);
|
||||||
}
|
}
|
||||||
|
|
||||||
sched_split_graph(sched, graph);
|
ggml_backend_sched_split_graph(sched, graph);
|
||||||
sched_alloc_splits(sched);
|
if (!ggml_backend_sched_alloc_splits(sched)) {
|
||||||
sched_compute_splits(sched);
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
if (!ggml_backend_sched_compute_splits(sched)) {
|
||||||
sched_reset(sched);
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
||||||
sched->callback_eval = callback;
|
sched->callback_eval = callback;
|
||||||
|
@ -1611,37 +1596,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
||||||
return sched->n_splits;
|
return sched->n_splits;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||||
int backend_index = sched_backend_prio(sched, backend);
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||||
return sched->tallocs[backend_index];
|
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
||||||
int backend_index = sched_backend_prio(sched, backend);
|
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
||||||
return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||||
int backend_index = sched_backend_prio(sched, backend);
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||||
node_allocr(node) = sched->tallocs[backend_index];
|
tensor_backend_id(node) = backend_index;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
||||||
ggml_tallocr_t allocr = node_allocr(node);
|
int backend_index = tensor_backend_id(node);
|
||||||
if (allocr == NULL) {
|
if (backend_index == -1) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return get_allocr_backend(sched, allocr);
|
return sched->backends[backend_index];
|
||||||
}
|
}
|
||||||
|
|
||||||
// utils
|
// utils
|
||||||
|
|
||||||
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(tensor->buffer == NULL);
|
GGML_ASSERT(tensor->buffer == NULL);
|
||||||
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
|
|
||||||
GGML_ASSERT(tensor->view_src != NULL);
|
GGML_ASSERT(tensor->view_src != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->data != NULL);
|
GGML_ASSERT(tensor->view_src->data != NULL);
|
||||||
|
@ -1665,7 +1643,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
|
||||||
ggml_backend_buffer_init_tensor(buffer, tensor);
|
ggml_backend_buffer_init_tensor(buffer, tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
|
static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
|
||||||
struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
|
struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
|
||||||
|
|
||||||
GGML_ASSERT(src != NULL);
|
GGML_ASSERT(src != NULL);
|
||||||
|
@ -1678,7 +1656,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
|
||||||
|
|
||||||
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
|
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
|
||||||
if (src->view_src != NULL) {
|
if (src->view_src != NULL) {
|
||||||
dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
|
dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
|
||||||
dst->view_offs = src->view_offs;
|
dst->view_offs = src->view_offs;
|
||||||
}
|
}
|
||||||
dst->op = src->op;
|
dst->op = src->op;
|
||||||
|
@ -1691,14 +1669,14 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru
|
||||||
if (s == NULL) {
|
if (s == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
|
dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
|
||||||
}
|
}
|
||||||
|
|
||||||
node_copies[id] = dst;
|
node_copies[id] = dst;
|
||||||
return dst;
|
return dst;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
|
static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
|
||||||
size_t id = ggml_hash_find(hash_set, src);
|
size_t id = ggml_hash_find(hash_set, src);
|
||||||
if (node_init[id]) {
|
if (node_init[id]) {
|
||||||
return;
|
return;
|
||||||
|
@ -1707,7 +1685,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
|
||||||
|
|
||||||
struct ggml_tensor * dst = node_copies[id];
|
struct ggml_tensor * dst = node_copies[id];
|
||||||
if (dst->view_src != NULL) {
|
if (dst->view_src != NULL) {
|
||||||
graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
||||||
ggml_backend_view_init(dst->view_src->buffer, dst);
|
ggml_backend_view_init(dst->view_src->buffer, dst);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -1720,17 +1698,17 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
|
||||||
if (s == NULL) {
|
if (s == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
graph_init_tensor(hash_set, node_copies, node_init, s);
|
graph_copy_init_tensor(hash_set, node_copies, node_init, s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
||||||
struct ggml_hash_set hash_set = {
|
struct ggml_hash_set hash_set = {
|
||||||
/* .size = */ graph->visited_hash_table.size,
|
/* .size = */ graph->visited_hash_table.size,
|
||||||
/* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
|
/* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
|
||||||
};
|
};
|
||||||
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
|
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
|
||||||
bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
|
bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
|
||||||
|
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
||||||
|
@ -1759,7 +1737,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
||||||
// dup nodes
|
// dup nodes
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
|
graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
|
||||||
}
|
}
|
||||||
|
|
||||||
// allocate nodes
|
// allocate nodes
|
||||||
|
@ -1784,7 +1762,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
||||||
// copy data and init views
|
// copy data and init views
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
graph_init_tensor(hash_set, node_copies, node_init, node);
|
graph_copy_init_tensor(hash_set, node_copies, node_init, node);
|
||||||
}
|
}
|
||||||
|
|
||||||
// build graph copy
|
// build graph copy
|
||||||
|
|
|
@ -130,11 +130,7 @@ extern "C" {
|
||||||
|
|
||||||
// in build_graph:
|
// in build_graph:
|
||||||
build_graph(...) {
|
build_graph(...) {
|
||||||
// allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
|
// manually assign nodes to a backend (optional, should not be needed in most cases)
|
||||||
alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
|
|
||||||
ggml_allocr_alloc(alloc_cpu, tensor);
|
|
||||||
|
|
||||||
// manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
|
|
||||||
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
|
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
|
||||||
ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
|
ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
|
||||||
}
|
}
|
||||||
|
@ -164,20 +160,19 @@ extern "C" {
|
||||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
||||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||||
// Initialize backend buffers from a measure graph
|
// Initialize backend buffers from a measure graph
|
||||||
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
||||||
// Get the number of splits of the last graph
|
// Get the number of splits of the last graph
|
||||||
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
||||||
|
|
||||||
GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
|
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||||
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||||
|
|
||||||
// Allocate and compute graph on the backend scheduler
|
// Allocate and compute graph on the backend scheduler
|
||||||
GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||||
|
|
||||||
// Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
|
// Reset all assignments and allocators - must be called before changing the node backends
|
||||||
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
||||||
|
|
||||||
// Set a callback to be called for each resulting node during graph compute
|
// Set a callback to be called for each resulting node during graph compute
|
||||||
|
|
223
ggml-cuda.cu
223
ggml-cuda.cu
|
@ -150,8 +150,8 @@
|
||||||
#define CUDA_USE_TENSOR_CORES
|
#define CUDA_USE_TENSOR_CORES
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// max batch size to use MMQ kernels when tensor cores are available
|
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
|
||||||
#define MMQ_MAX_BATCH_SIZE 32
|
#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
#define __CUDA_ARCH__ 1300
|
#define __CUDA_ARCH__ 1300
|
||||||
|
@ -5310,51 +5310,59 @@ template <bool need_check> static __global__ void
|
||||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MMVQ_NWARPS_NVIDIA 4
|
template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||||
#define MMVQ_NWARPS_AMD_RDNA2 1
|
|
||||||
#define MMVQ_NWARPS_AMD_OLD 4
|
|
||||||
|
|
||||||
template <int nwarps, int ncols_y_template, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1) // tells the compiler to use as many registers as it wants
|
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||||
|
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void mul_mat_vec_q(
|
static __global__ void mul_mat_vec_q(
|
||||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||||
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y_par, const int nrows_dst) {
|
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
||||||
|
|
||||||
const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par;
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||||
|
constexpr int nwarps = 1;
|
||||||
|
constexpr int rows_per_cuda_block = 1;
|
||||||
|
#else
|
||||||
|
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
||||||
|
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
||||||
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||||
|
|
||||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||||
const int row = blockIdx.x;
|
const int row0 = rows_per_cuda_block*blockIdx.x;
|
||||||
|
|
||||||
const int blocks_per_row_x = ncols_x / qk;
|
const int blocks_per_row_x = ncols_x / qk;
|
||||||
const int blocks_per_col_y = nrows_y / QK8_1;
|
const int blocks_per_col_y = nrows_y / QK8_1;
|
||||||
const int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
|
constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
|
||||||
|
|
||||||
// partial sum for each thread
|
// partial sum for each thread
|
||||||
float tmp[ncols_y_template != 0 ? ncols_y_template : 8] = {0.0f};
|
float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
|
||||||
|
|
||||||
const block_q_t * x = (const block_q_t *) vx;
|
const block_q_t * x = (const block_q_t *) vx;
|
||||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||||
|
|
||||||
for (int i = tid / (qi/vdr); i < blocks_per_row_x; i += blocks_per_iter) {
|
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
|
||||||
const int ibx = row*blocks_per_row_x + i; // x block index
|
const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
|
||||||
|
|
||||||
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
// x block quant index when casting the quants to int
|
||||||
|
const int kqs = vdr * (tid % (qi/vdr));
|
||||||
const int iqs = vdr * (tid % (qi/vdr)); // x block quant index when casting the quants to int
|
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j = 0; j < ncols_y; ++j) {
|
for (int j = 0; j < ncols_y; ++j) {
|
||||||
tmp[j] += vec_dot_q_cuda(&x[ibx], &y[j*blocks_per_col_y + iby], iqs);
|
#pragma unroll
|
||||||
|
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
||||||
|
tmp[j][i] += vec_dot_q_cuda(
|
||||||
|
&x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y_template != 0 ? ncols_y_template : 8][WARP_SIZE];
|
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
|
||||||
if (threadIdx.y > 0) {
|
if (threadIdx.y > 0) {
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j = 0; j < ncols_y; ++j) {
|
for (int j = 0; j < ncols_y; ++j) {
|
||||||
tmp_shared[threadIdx.y-1][j][threadIdx.x] = tmp[j];
|
#pragma unroll
|
||||||
|
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
||||||
|
tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
@ -5366,13 +5374,16 @@ static __global__ void mul_mat_vec_q(
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j = 0; j < ncols_y; ++j) {
|
for (int j = 0; j < ncols_y; ++j) {
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < nwarps-1; ++i) {
|
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
||||||
tmp[j] += tmp_shared[i][j][threadIdx.x];
|
#pragma unroll
|
||||||
|
for (int l = 0; l < nwarps-1; ++l) {
|
||||||
|
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
|
||||||
|
}
|
||||||
|
tmp[j][i] = warp_reduce_sum(tmp[j][i]);
|
||||||
}
|
}
|
||||||
tmp[j] = warp_reduce_sum(tmp[j]);
|
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x < rows_per_cuda_block) {
|
||||||
dst[j*nrows_dst + row] = tmp[j];
|
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6851,65 +6862,75 @@ static void mul_mat_vec_q_cuda(
|
||||||
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
GGML_ASSERT(ncols_x % qk == 0);
|
GGML_ASSERT(ncols_x % qk == 0);
|
||||||
GGML_ASSERT(ncols_y <= 4);
|
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
|
||||||
int nwarps;
|
int64_t nwarps = 1;
|
||||||
if (g_device_caps[id].cc >= CC_OFFSET_AMD) {
|
int64_t rows_per_cuda_block = 1;
|
||||||
nwarps = g_device_caps[id].cc >= CC_RDNA2 ? MMVQ_NWARPS_AMD_RDNA2 : MMVQ_NWARPS_AMD_OLD;
|
|
||||||
} else {
|
|
||||||
nwarps = MMVQ_NWARPS_NVIDIA;
|
|
||||||
}
|
|
||||||
|
|
||||||
const dim3 block_nums(nrows_x, 1, 1);
|
if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
|
||||||
|
switch(ncols_y) {
|
||||||
|
case 1:
|
||||||
|
nwarps = 4;
|
||||||
|
rows_per_cuda_block = 1;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
case 3:
|
||||||
|
case 4:
|
||||||
|
nwarps = 4;
|
||||||
|
rows_per_cuda_block = 2;
|
||||||
|
break;
|
||||||
|
case 5:
|
||||||
|
case 6:
|
||||||
|
case 7:
|
||||||
|
case 8:
|
||||||
|
nwarps = 2;
|
||||||
|
rows_per_cuda_block = 2;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
|
||||||
|
const dim3 block_nums(nblocks, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
||||||
|
|
||||||
switch (nwarps) {
|
switch (ncols_y) {
|
||||||
case 1: switch(ncols_y) {
|
|
||||||
case 1:
|
case 1:
|
||||||
mul_mat_vec_q<1, 1, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
mul_mat_vec_q<1, 2, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
mul_mat_vec_q<1, 3, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
mul_mat_vec_q<1, 4, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
default:
|
case 5:
|
||||||
GGML_ASSERT(false);
|
mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
} break;
|
case 6:
|
||||||
case 4: switch(ncols_y) {
|
mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
case 1:
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
mul_mat_vec_q<4, 1, qk, qi, block_q_t, vdr, vec_dot>
|
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 7:
|
||||||
mul_mat_vec_q<4, 2, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 8:
|
||||||
mul_mat_vec_q<4, 3, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 4:
|
|
||||||
mul_mat_vec_q<4, 4, qk, qi, block_q_t, vdr, vec_dot>
|
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
break;
|
|
||||||
} break;
|
|
||||||
|
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
break;
|
break;
|
||||||
|
@ -9735,7 +9756,7 @@ static __global__ void k_compute_batched_ptrs(
|
||||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||||
|
|
||||||
|
@ -9893,39 +9914,69 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
|
|
||||||
int64_t min_compute_capability = INT_MAX;
|
int64_t min_compute_capability = INT_MAX;
|
||||||
|
|
||||||
|
bool any_pascal_with_slow_fp16 = false;
|
||||||
if (split) {
|
if (split) {
|
||||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
||||||
auto & tensor_split = buft_ctx->tensor_split;
|
auto & tensor_split = buft_ctx->tensor_split;
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
|
// skip devices that are not going to do any work:
|
||||||
|
if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (min_compute_capability > g_device_caps[id].cc) {
|
||||||
min_compute_capability = g_device_caps[id].cc;
|
min_compute_capability = g_device_caps[id].cc;
|
||||||
}
|
}
|
||||||
|
if (g_device_caps[id].cc == 610) {
|
||||||
|
any_pascal_with_slow_fp16 = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
min_compute_capability = g_device_caps[g_main_device].cc;
|
min_compute_capability = g_device_caps[g_main_device].cc;
|
||||||
|
any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// check data types and tensor shapes for custom matrix multiplication kernels:
|
||||||
|
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
||||||
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
|
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
|
||||||
|
|
||||||
|
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||||
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
|
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||||
|
|
||||||
|
bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
|
||||||
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
|
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
|
||||||
bool use_mul_mat_q = ggml_is_quantized(src0->type);
|
|
||||||
#ifdef CUDA_USE_TENSOR_CORES
|
#ifdef CUDA_USE_TENSOR_CORES
|
||||||
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
|
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
|
||||||
#endif // CUDA_USE_TENSOR_CORES
|
#endif // CUDA_USE_TENSOR_CORES
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
|
// fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
|
||||||
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
|
||||||
|
|
||||||
|
// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
|
||||||
|
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
|
||||||
|
use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
|
||||||
|
|
||||||
#ifdef CUDA_USE_TENSOR_CORES
|
#ifdef CUDA_USE_TENSOR_CORES
|
||||||
// when tensor cores are available, use them for large batch size
|
// when tensor cores are available, use them for large batch size
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
||||||
use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
|
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
||||||
#endif // CUDA_USE_TENSOR_CORES
|
#endif // CUDA_USE_TENSOR_CORES
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type);
|
// if mmvq is available it's a better choice than dmmv:
|
||||||
|
#ifndef GGML_CUDA_FORCE_DMMV
|
||||||
|
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||||
|
#endif // GGML_CUDA_FORCE_DMMV
|
||||||
|
|
||||||
// debug helpers
|
// debug helpers
|
||||||
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
||||||
|
@ -9943,24 +9994,10 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||||
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||||
// KQ + KQV multi-batch
|
// KQ + KQV multi-batch
|
||||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
|
||||||
} else if (src0->type == GGML_TYPE_F32) {
|
} else if (use_dequantize_mul_mat_vec) {
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
|
||||||
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
|
||||||
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
|
|
||||||
#ifdef GGML_CUDA_FORCE_DMMV
|
|
||||||
const bool use_mul_mat_vec_q = false;
|
|
||||||
#else
|
|
||||||
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
|
||||||
#endif // GGML_CUDA_FORCE_DMMV
|
|
||||||
|
|
||||||
if (use_mul_mat_vec_q) {
|
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
|
||||||
} else {
|
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
||||||
}
|
} else if (use_mul_mat_vec_q) {
|
||||||
} else {
|
|
||||||
if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) {
|
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
||||||
} else if (use_mul_mat_q) {
|
} else if (use_mul_mat_q) {
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
||||||
|
@ -9968,10 +10005,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
template<typename ... Srcs>
|
template<typename ... Srcs>
|
||||||
|
|
368
ggml-quants.c
368
ggml-quants.c
|
@ -49,6 +49,8 @@
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
|
#define UNUSED GGML_UNUSED
|
||||||
|
|
||||||
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
||||||
|
|
||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
||||||
|
@ -3677,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
assert((nrc == 2) || (nrc == 1));
|
||||||
|
#else
|
||||||
|
assert(nrc == 1);
|
||||||
|
#endif
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_0 * restrict x = vx;
|
const block_q4_0 * restrict x = vx;
|
||||||
const block_q8_0 * restrict y = vy;
|
const block_q8_0 * restrict y = vy;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
if (nrc == 2) {
|
||||||
|
const block_q4_0 * restrict vx0 = vx;
|
||||||
|
const block_q4_0 * restrict vx1 = vx + bx;
|
||||||
|
|
||||||
|
const block_q8_0 * restrict vy0 = vy;
|
||||||
|
const block_q8_0 * restrict vy1 = vy + by;
|
||||||
|
|
||||||
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
const block_q4_0 * restrict b_x0 = &vx0[i];
|
||||||
|
const block_q4_0 * restrict b_x1 = &vx1[i];
|
||||||
|
const block_q8_0 * restrict b_y0 = &vy0[i];
|
||||||
|
const block_q8_0 * restrict b_y1 = &vy1[i];
|
||||||
|
|
||||||
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||||
|
const int8x16_t s8b = vdupq_n_s8(0x8);
|
||||||
|
|
||||||
|
const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
|
||||||
|
const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||||
|
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||||
|
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||||
|
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||||
|
|
||||||
|
// sub 8
|
||||||
|
const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
|
||||||
|
const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
|
||||||
|
const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
|
||||||
|
const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
|
||||||
|
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
|
||||||
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||||
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||||
|
|
||||||
|
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||||
|
|
||||||
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
|
||||||
|
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
|
||||||
|
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
|
||||||
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
|
||||||
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
||||||
|
l1, r1)), l2, r2)), l3, r3))), scale);
|
||||||
|
}
|
||||||
|
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||||
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||||
|
|
||||||
|
vst1_f32(s, vget_low_f32(sumv2));
|
||||||
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
@ -3740,15 +3819,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
|
||||||
/* Compute combined scale for the block */
|
/* Compute combined scale for the block */
|
||||||
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
||||||
|
|
||||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
||||||
|
|
||||||
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
||||||
const __m256i off = _mm256_set1_epi8( 8 );
|
const __m256i off = _mm256_set1_epi8( 8 );
|
||||||
bx = _mm256_sub_epi8( bx, off );
|
qx = _mm256_sub_epi8( qx, off );
|
||||||
|
|
||||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
|
||||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
||||||
|
|
||||||
/* Multiply q with scale and accumulate */
|
/* Multiply q with scale and accumulate */
|
||||||
acc = _mm256_fmadd_ps( d, q, acc );
|
acc = _mm256_fmadd_ps( d, q, acc );
|
||||||
|
@ -3967,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_1;
|
const int qk = QK8_1;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
assert((nrc == 2) || (nrc == 1));
|
||||||
|
#else
|
||||||
|
assert(nrc == 1);
|
||||||
|
#endif
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_1 * restrict x = vx;
|
const block_q4_1 * restrict x = vx;
|
||||||
const block_q8_1 * restrict y = vy;
|
const block_q8_1 * restrict y = vy;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
if (nrc == 2) {
|
||||||
|
const block_q4_1 * restrict vx0 = vx;
|
||||||
|
const block_q4_1 * restrict vx1 = vx + bx;
|
||||||
|
const block_q8_1 * restrict vy0 = vy;
|
||||||
|
const block_q8_1 * restrict vy1 = vy + by;
|
||||||
|
|
||||||
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
float32x4_t summs0 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
const block_q4_1 * restrict b_x0 = &vx0[i];
|
||||||
|
const block_q4_1 * restrict b_x1 = &vx1[i];
|
||||||
|
const block_q8_1 * restrict b_y0 = &vy0[i];
|
||||||
|
const block_q8_1 * restrict b_y1 = &vy1[i];
|
||||||
|
|
||||||
|
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
|
||||||
|
GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
|
||||||
|
GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
|
||||||
|
GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
|
||||||
|
summs0 += summs_t;
|
||||||
|
|
||||||
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||||
|
|
||||||
|
const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
|
||||||
|
const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||||
|
const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||||
|
const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||||
|
const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
|
||||||
|
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
|
||||||
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||||
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||||
|
|
||||||
|
// mmla into int32x4_t
|
||||||
|
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||||
|
|
||||||
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
|
||||||
|
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
|
||||||
|
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
|
||||||
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
||||||
|
l1, r1)), l2, r2)), l3, r3))), scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||||
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||||
|
sumv2 = sumv2 + summs0;
|
||||||
|
|
||||||
|
vst1_f32(s, vget_low_f32(sumv2));
|
||||||
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// TODO: add WASM SIMD
|
// TODO: add WASM SIMD
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
@ -4039,10 +4196,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
|
||||||
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
||||||
|
|
||||||
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
|
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
|
||||||
const __m256i bx = bytes_from_nibbles_32(x[i].qs);
|
const __m256i qx = bytes_from_nibbles_32(x[i].qs);
|
||||||
const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
|
const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );
|
||||||
|
|
||||||
const __m256 xy = mul_sum_us8_pairs_float(bx, by);
|
const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
|
||||||
|
|
||||||
// Accumulate d0*d1*x*y
|
// Accumulate d0*d1*x*y
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
@ -4107,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
assert(qk == QK5_0);
|
assert(qk == QK5_0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_0 * restrict x = vx;
|
const block_q5_0 * restrict x = vx;
|
||||||
const block_q8_0 * restrict y = vy;
|
const block_q8_0 * restrict y = vy;
|
||||||
|
@ -4256,14 +4418,14 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
||||||
/* Compute combined scale for the block */
|
/* Compute combined scale for the block */
|
||||||
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
||||||
|
|
||||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
||||||
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||||
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
|
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
|
||||||
bx = _mm256_or_si256(bx, bxhi);
|
qx = _mm256_or_si256(qx, bxhi);
|
||||||
|
|
||||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
|
||||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
||||||
|
|
||||||
/* Multiply q with scale and accumulate */
|
/* Multiply q with scale and accumulate */
|
||||||
acc = _mm256_fmadd_ps(d, q, acc);
|
acc = _mm256_fmadd_ps(d, q, acc);
|
||||||
|
@ -4393,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_1;
|
const int qk = QK8_1;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
assert(qk == QK5_1);
|
assert(qk == QK5_1);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_1 * restrict x = vx;
|
const block_q5_1 * restrict x = vx;
|
||||||
const block_q8_1 * restrict y = vy;
|
const block_q8_1 * restrict y = vy;
|
||||||
|
@ -4555,15 +4722,15 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
||||||
|
|
||||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
||||||
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||||
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
||||||
bx = _mm256_or_si256(bx, bxhi);
|
qx = _mm256_or_si256(qx, bxhi);
|
||||||
|
|
||||||
const __m256 dy = _mm256_set1_ps(y[i].d);
|
const __m256 dy = _mm256_set1_ps(y[i].d);
|
||||||
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
|
||||||
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
||||||
|
|
||||||
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
||||||
}
|
}
|
||||||
|
@ -4692,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
assert((nrc == 2) || (nrc == 1));
|
||||||
|
#else
|
||||||
|
assert(nrc == 1);
|
||||||
|
#endif
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q8_0 * restrict x = vx;
|
const block_q8_0 * restrict x = vx;
|
||||||
const block_q8_0 * restrict y = vy;
|
const block_q8_0 * restrict y = vy;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
if (nrc == 2) {
|
||||||
|
const block_q8_0 * restrict vx0 = vx;
|
||||||
|
const block_q8_0 * restrict vx1 = vx + bx;
|
||||||
|
const block_q8_0 * restrict vy0 = vy;
|
||||||
|
const block_q8_0 * restrict vy1 = vy + by;
|
||||||
|
|
||||||
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
const block_q8_0 * restrict b_x0 = &vx0[i];
|
||||||
|
const block_q8_0 * restrict b_y0 = &vy0[i];
|
||||||
|
|
||||||
|
const block_q8_0 * restrict b_x1 = &vx1[i];
|
||||||
|
const block_q8_0 * restrict b_y1 = &vy1[i];
|
||||||
|
|
||||||
|
const int8x16_t x0_l = vld1q_s8(b_x0->qs);
|
||||||
|
const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
|
||||||
|
const int8x16_t x1_l = vld1q_s8(b_x1->qs);
|
||||||
|
const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
|
||||||
|
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
|
||||||
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||||
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||||
|
|
||||||
|
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||||
|
|
||||||
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
|
||||||
|
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
|
||||||
|
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
|
||||||
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
|
||||||
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
||||||
|
l1, r1)), l2, r2)), l3, r3))), scale);
|
||||||
|
}
|
||||||
|
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||||
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||||
|
|
||||||
|
vst1_f32(s, vget_low_f32(sumv2));
|
||||||
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
@ -4742,10 +4973,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
// Compute combined scale for the block
|
// Compute combined scale for the block
|
||||||
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
||||||
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
__m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
||||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
|
||||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
||||||
|
|
||||||
// Multiply q with scale and accumulate
|
// Multiply q with scale and accumulate
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
@ -4795,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q2_K * restrict x = vx;
|
const block_q2_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -5171,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q2_K * restrict x = vx;
|
const block_q2_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -5429,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const uint32_t kmask1 = 0x03030303;
|
const uint32_t kmask1 = 0x03030303;
|
||||||
const uint32_t kmask2 = 0x0f0f0f0f;
|
const uint32_t kmask2 = 0x0f0f0f0f;
|
||||||
|
@ -5949,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q3_K * restrict x = vx;
|
const block_q3_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -6292,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_K * restrict x = vx;
|
const block_q4_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -6648,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_K * restrict x = vx;
|
const block_q4_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -6891,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_K * restrict x = vx;
|
const block_q5_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -7311,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_K * restrict x = vx;
|
const block_q5_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -7577,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q6_K * restrict x = vx;
|
const block_q6_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8009,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q6_K * restrict x = vx;
|
const block_q6_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8339,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = {
|
||||||
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||||
};
|
};
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_iq2_xxs * restrict x = vx;
|
const block_iq2_xxs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8462,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_iq2_xs * restrict x = vx;
|
const block_iq2_xs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8682,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO
|
// TODO
|
||||||
void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_iq3_xxs * restrict x = vx;
|
const block_iq3_xxs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
|
@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_
|
||||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
|
|
||||||
// Dot product
|
// Dot product
|
||||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
|
|
|
@ -11578,11 +11578,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
|
||||||
}
|
}
|
||||||
char * dst_ptr = (char *) dst;
|
char * dst_ptr = (char *) dst;
|
||||||
|
|
||||||
const int64_t ne0 = src->ne[0];
|
GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
|
||||||
const int64_t nb0 = src->nb[0];
|
GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
|
||||||
const int64_t nb1 = src->nb[1];
|
|
||||||
const int64_t nb2 = src->nb[2];
|
|
||||||
const int64_t nb3 = src->nb[3];
|
|
||||||
const enum ggml_type type = src->type;
|
const enum ggml_type type = src->type;
|
||||||
const int64_t ts = ggml_type_size(type);
|
const int64_t ts = ggml_type_size(type);
|
||||||
const int64_t bs = ggml_blck_size(type);
|
const int64_t bs = ggml_blck_size(type);
|
||||||
|
@ -12426,9 +12423,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
|
||||||
const int64_t ne01 = src0->ne[1];
|
|
||||||
const int64_t ne02 = src0->ne[2];
|
|
||||||
const int64_t nrows = ggml_nrows(src0);
|
const int64_t nrows = ggml_nrows(src0);
|
||||||
|
|
||||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||||
|
@ -12758,15 +12753,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||||
ggml_sycl_op_mul_mat_t op,
|
ggml_sycl_op_mul_mat_t op,
|
||||||
const bool convert_src1_to_q8_1) try {
|
const bool convert_src1_to_q8_1) try {
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
||||||
const int64_t ne01 = src0->ne[1];
|
|
||||||
const int64_t ne02 = src0->ne[2];
|
|
||||||
const int64_t ne03 = src0->ne[3];
|
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
||||||
const int64_t ne11 = src1->ne[1];
|
|
||||||
const int64_t ne12 = src1->ne[2];
|
|
||||||
const int64_t ne13 = src1->ne[3];
|
|
||||||
const int64_t nrows1 = ggml_nrows(src1);
|
const int64_t nrows1 = ggml_nrows(src1);
|
||||||
|
|
||||||
GGML_ASSERT(ne03 == ne13);
|
GGML_ASSERT(ne03 == ne13);
|
||||||
|
@ -13337,23 +13326,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
|
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
||||||
const int64_t ne01 = src0->ne[1];
|
|
||||||
const int64_t ne02 = src0->ne[2];
|
|
||||||
const int64_t ne03 = src0->ne[3];
|
|
||||||
|
|
||||||
const int64_t nb01 = src0->nb[1];
|
GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
|
||||||
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
|
|
||||||
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
|
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
||||||
const int64_t ne11 = src1->ne[1];
|
|
||||||
const int64_t ne12 = src1->ne[2];
|
|
||||||
const int64_t ne13 = src1->ne[3];
|
|
||||||
|
|
||||||
const int64_t nb11 = src1->nb[1];
|
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
|
||||||
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
|
||||||
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
|
||||||
|
|
||||||
const int64_t ne1 = ggml_nelements(src1);
|
const int64_t ne1 = ggml_nelements(src1);
|
||||||
const int64_t ne = ggml_nelements(dst);
|
const int64_t ne = ggml_nelements(dst);
|
||||||
|
@ -13655,23 +13634,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
|
||||||
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
|
GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
|
||||||
const int64_t ne01 = src00->ne[1];
|
|
||||||
const int64_t ne02 = src00->ne[2];
|
|
||||||
const int64_t ne03 = src00->ne[3];
|
|
||||||
|
|
||||||
//const int64_t nb01 = src00->nb[1];
|
//const int64_t nb01 = src00->nb[1];
|
||||||
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
|
GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
|
||||||
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
|
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
||||||
const int64_t ne11 = src1->ne[1];
|
|
||||||
const int64_t ne12 = src1->ne[2];
|
|
||||||
const int64_t ne13 = src1->ne[3];
|
|
||||||
|
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
|
||||||
//const int64_t nb11 = src1->nb[1];
|
//const int64_t nb11 = src1->nb[1];
|
||||||
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
|
||||||
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
|
||||||
|
|
||||||
const int64_t ne1 = ggml_nelements(src1);
|
const int64_t ne1 = ggml_nelements(src1);
|
||||||
const int64_t ne = ggml_nelements(dst);
|
const int64_t ne = ggml_nelements(dst);
|
||||||
|
@ -13940,25 +13911,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||||
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
||||||
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||||
const int64_t ne01 = src0->ne[1];
|
|
||||||
const int64_t ne02 = src0->ne[2];
|
|
||||||
|
|
||||||
|
|
||||||
const int64_t nb00 = src0->nb[0];
|
|
||||||
const int64_t nb01 = src0->nb[1];
|
|
||||||
const int64_t nb02 = src0->nb[2];
|
|
||||||
const int64_t nb03 = src0->nb[3];
|
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
|
||||||
const int64_t ne11 = src1->ne[1];
|
|
||||||
const int64_t ne12 = src1->ne[2];
|
|
||||||
|
|
||||||
|
|
||||||
const int64_t nb10 = src1->nb[0];
|
|
||||||
const int64_t nb11 = src1->nb[1];
|
|
||||||
const int64_t nb12 = src1->nb[2];
|
|
||||||
const int64_t nb13 = src1->nb[3];
|
|
||||||
|
|
||||||
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
||||||
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
|
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
|
||||||
|
|
103
ggml-vulkan.cpp
103
ggml-vulkan.cpp
|
@ -27,6 +27,7 @@
|
||||||
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
||||||
|
|
||||||
#define VK_VENDOR_ID_AMD 0x1002
|
#define VK_VENDOR_ID_AMD 0x1002
|
||||||
|
#define VK_VENDOR_ID_APPLE 0x106b
|
||||||
#define VK_VENDOR_ID_INTEL 0x8086
|
#define VK_VENDOR_ID_INTEL 0x8086
|
||||||
#define VK_VENDOR_ID_NVIDIA 0x10de
|
#define VK_VENDOR_ID_NVIDIA 0x10de
|
||||||
|
|
||||||
|
@ -2034,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
|
||||||
return ctx->pipeline_matmul_f32_aligned_l.align;
|
return ctx->pipeline_matmul_f32_aligned_l.align;
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
|
||||||
std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
|
|
||||||
#endif
|
|
||||||
if (bit16_x && bit16_y) {
|
if (bit16_x && bit16_y) {
|
||||||
if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
if (m <= 32 || n <= 32) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
||||||
}
|
}
|
||||||
if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
|
||||||
|
}
|
||||||
|
if (bit16_x && !bit16_y) {
|
||||||
|
if (m <= 32 || n <= 32) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
||||||
|
}
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
|
||||||
|
}
|
||||||
|
if (!bit16_x && bit16_y) {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (m <= 32 || n <= 32) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
||||||
|
}
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
if (bit16_x && bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
|
||||||
|
}
|
||||||
|
if (bit16_x && !bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
|
||||||
|
}
|
||||||
|
if (!bit16_x && bit16_y) {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
if (bit16_x && bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
||||||
|
}
|
||||||
|
if (bit16_x && !bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
||||||
|
}
|
||||||
|
if (!bit16_x && bit16_y) {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
|
||||||
|
#endif
|
||||||
|
switch (ctx->device.lock()->vendor_id) {
|
||||||
|
case VK_VENDOR_ID_AMD:
|
||||||
|
return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
|
||||||
|
case VK_VENDOR_ID_APPLE:
|
||||||
|
return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
|
||||||
|
case VK_VENDOR_ID_INTEL:
|
||||||
|
return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bit16_x && bit16_y) {
|
||||||
|
if (m <= 32 || n <= 32) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
||||||
|
}
|
||||||
|
if (m <= 64 || n <= 64) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " M" << std::endl;
|
std::cerr << " M" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -2057,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
||||||
return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
|
||||||
}
|
}
|
||||||
if (bit16_x && !bit16_y) {
|
if (bit16_x && !bit16_y) {
|
||||||
if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
if (m <= 32 || n <= 32) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
||||||
}
|
}
|
||||||
if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
|
if (m <= 64 || n <= 64) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " M" << std::endl;
|
std::cerr << " M" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -2078,13 +2161,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
if (m <= 32 || n <= 32) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
||||||
}
|
}
|
||||||
if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
|
if (m <= 64 || n <= 64) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " M" << std::endl;
|
std::cerr << " M" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
192
ggml.c
192
ggml.c
|
@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
|
||||||
|
|
||||||
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
||||||
|
|
||||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
|
||||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
|
||||||
|
|
||||||
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I8] = {
|
[GGML_TYPE_I8] = {
|
||||||
|
@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.is_quantized = false,
|
.is_quantized = false,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
||||||
.vec_dot_type = GGML_TYPE_F32,
|
.vec_dot_type = GGML_TYPE_F32,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_F16] = {
|
[GGML_TYPE_F16] = {
|
||||||
.type_name = "f16",
|
.type_name = "f16",
|
||||||
|
@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||||
.vec_dot_type = GGML_TYPE_F16,
|
.vec_dot_type = GGML_TYPE_F16,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_0] = {
|
[GGML_TYPE_Q4_0] = {
|
||||||
.type_name = "q4_0",
|
.type_name = "q4_0",
|
||||||
|
@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
.nrows = 2,
|
||||||
|
#else
|
||||||
|
.nrows = 1,
|
||||||
|
#endif
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_1] = {
|
[GGML_TYPE_Q4_1] = {
|
||||||
.type_name = "q4_1",
|
.type_name = "q4_1",
|
||||||
|
@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
.nrows = 2,
|
||||||
|
#else
|
||||||
|
.nrows = 1,
|
||||||
|
#endif
|
||||||
},
|
},
|
||||||
[4] = { // GGML_TYPE_Q4_2
|
[4] = { // GGML_TYPE_Q4_2
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
|
@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[5] = { // GGML_TYPE_Q4_3
|
[5] = { // GGML_TYPE_Q4_3
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
|
@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_0] = {
|
[GGML_TYPE_Q5_0] = {
|
||||||
.type_name = "q5_0",
|
.type_name = "q5_0",
|
||||||
|
@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_1] = {
|
[GGML_TYPE_Q5_1] = {
|
||||||
.type_name = "q5_1",
|
.type_name = "q5_1",
|
||||||
|
@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_0] = {
|
[GGML_TYPE_Q8_0] = {
|
||||||
.type_name = "q8_0",
|
.type_name = "q8_0",
|
||||||
|
@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
.nrows = 2,
|
||||||
|
#else
|
||||||
|
.nrows = 1,
|
||||||
|
#endif
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_1] = {
|
[GGML_TYPE_Q8_1] = {
|
||||||
.type_name = "q8_1",
|
.type_name = "q8_1",
|
||||||
|
@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float = quantize_row_q8_1,
|
.from_float = quantize_row_q8_1,
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q2_K] = {
|
[GGML_TYPE_Q2_K] = {
|
||||||
.type_name = "q2_K",
|
.type_name = "q2_K",
|
||||||
|
@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q3_K] = {
|
[GGML_TYPE_Q3_K] = {
|
||||||
.type_name = "q3_K",
|
.type_name = "q3_K",
|
||||||
|
@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_K] = {
|
[GGML_TYPE_Q4_K] = {
|
||||||
.type_name = "q4_K",
|
.type_name = "q4_K",
|
||||||
|
@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_K] = {
|
[GGML_TYPE_Q5_K] = {
|
||||||
.type_name = "q5_K",
|
.type_name = "q5_K",
|
||||||
|
@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q6_K] = {
|
[GGML_TYPE_Q6_K] = {
|
||||||
.type_name = "q6_K",
|
.type_name = "q6_K",
|
||||||
|
@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ2_XXS] = {
|
[GGML_TYPE_IQ2_XXS] = {
|
||||||
.type_name = "iq2_xxs",
|
.type_name = "iq2_xxs",
|
||||||
|
@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ2_XS] = {
|
[GGML_TYPE_IQ2_XS] = {
|
||||||
.type_name = "iq2_xs",
|
.type_name = "iq2_xs",
|
||||||
|
@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ3_XXS] = {
|
[GGML_TYPE_IQ3_XXS] = {
|
||||||
.type_name = "iq3_xxs",
|
.type_name = "iq3_xxs",
|
||||||
|
@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
||||||
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_K] = {
|
[GGML_TYPE_Q8_K] = {
|
||||||
.type_name = "q8_K",
|
.type_name = "q8_K",
|
||||||
|
@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
|
||||||
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
||||||
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
||||||
|
|
||||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
|
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
#ifdef GGML_SIMD
|
#ifdef GGML_SIMD
|
||||||
float sumf = 0.0f;
|
float sumf = 0.0f;
|
||||||
const int np = (n & ~(GGML_F32_STEP - 1));
|
const int np = (n & ~(GGML_F32_STEP - 1));
|
||||||
|
@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
|
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
ggml_float sumf = 0.0;
|
ggml_float sumf = 0.0;
|
||||||
|
|
||||||
#if defined(GGML_SIMD)
|
#if defined(GGML_SIMD)
|
||||||
|
@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
|
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
|
||||||
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
||||||
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
||||||
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
|
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
|
||||||
|
@ -2607,7 +2649,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
||||||
/*.nb =*/ { 0, 0, 0, 0 },
|
/*.nb =*/ { 0, 0, 0, 0 },
|
||||||
/*.op =*/ GGML_OP_NONE,
|
/*.op =*/ GGML_OP_NONE,
|
||||||
/*.op_params =*/ { 0 },
|
/*.op_params =*/ { 0 },
|
||||||
/*.is_param =*/ false,
|
/*.flags =*/ 0,
|
||||||
/*.grad =*/ NULL,
|
/*.grad =*/ NULL,
|
||||||
/*.src =*/ { NULL },
|
/*.src =*/ { NULL },
|
||||||
/*.perf_runs =*/ 0,
|
/*.perf_runs =*/ 0,
|
||||||
|
@ -6509,7 +6551,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
|
||||||
void ggml_set_param(
|
void ggml_set_param(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * tensor) {
|
struct ggml_tensor * tensor) {
|
||||||
tensor->is_param = true;
|
tensor->flags |= GGML_TENSOR_FLAG_PARAM;
|
||||||
|
|
||||||
GGML_ASSERT(tensor->grad == NULL);
|
GGML_ASSERT(tensor->grad == NULL);
|
||||||
tensor->grad = ggml_dup_tensor(ctx, tensor);
|
tensor->grad = ggml_dup_tensor(ctx, tensor);
|
||||||
|
@ -9992,6 +10034,7 @@ static void ggml_compute_forward_mul_mat(
|
||||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||||
|
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||||
|
|
||||||
GGML_ASSERT(ne0 == ne01);
|
GGML_ASSERT(ne0 == ne01);
|
||||||
GGML_ASSERT(ne1 == ne11);
|
GGML_ASSERT(ne1 == ne11);
|
||||||
|
@ -10159,12 +10202,23 @@ static void ggml_compute_forward_mul_mat(
|
||||||
const int64_t blck_0 = 16;
|
const int64_t blck_0 = 16;
|
||||||
const int64_t blck_1 = 16;
|
const int64_t blck_1 = 16;
|
||||||
|
|
||||||
|
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
|
||||||
|
int64_t nrc = vec_dot_num_rows;
|
||||||
|
// TODO: currently the mmla kernels support only even numbered rows/cols.
|
||||||
|
// this check can be removed once they are extended to support odd numbered rows/cols too
|
||||||
|
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
|
||||||
|
nrc = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
|
||||||
|
|
||||||
// attempt to reduce false-sharing (does not seem to make a difference)
|
// attempt to reduce false-sharing (does not seem to make a difference)
|
||||||
float tmp[16];
|
// 16 * 2, accounting for mmla kernels
|
||||||
|
float tmp[32];
|
||||||
|
|
||||||
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
||||||
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
||||||
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
|
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
|
||||||
const int64_t i13 = (ir1/(ne12*ne1));
|
const int64_t i13 = (ir1/(ne12*ne1));
|
||||||
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
|
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
|
||||||
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
||||||
|
@ -10187,17 +10241,19 @@ static void ggml_compute_forward_mul_mat(
|
||||||
(src1_cont || src1->type != vec_dot_type
|
(src1_cont || src1->type != vec_dot_type
|
||||||
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
||||||
: (i11*nb11 + i12*nb12 + i13*nb13));
|
: (i11*nb11 + i12*nb12 + i13*nb13));
|
||||||
|
|
||||||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
||||||
|
|
||||||
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||||
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
|
||||||
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
|
vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int cn = 0; cn < nrc; ++cn) {
|
||||||
|
memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||||
}
|
}
|
||||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10386,7 +10442,7 @@ static void ggml_compute_forward_mul_mat_id(
|
||||||
//}
|
//}
|
||||||
|
|
||||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||||
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
|
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
|
||||||
}
|
}
|
||||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||||
}
|
}
|
||||||
|
@ -11568,7 +11624,7 @@ static void ggml_compute_forward_soft_max_back_f32(
|
||||||
|
|
||||||
// linear runtime, no additional memory
|
// linear runtime, no additional memory
|
||||||
float dot_y_dy = 0;
|
float dot_y_dy = 0;
|
||||||
ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
|
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
|
||||||
ggml_vec_cpy_f32 (nc, dx, dy);
|
ggml_vec_cpy_f32 (nc, dx, dy);
|
||||||
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
|
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
|
||||||
ggml_vec_mul_f32 (nc, dx, dx, y);
|
ggml_vec_mul_f32 (nc, dx, dx, y);
|
||||||
|
@ -12369,9 +12425,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
|
||||||
const int i1n = i10*ne11;
|
const int i1n = i10*ne11;
|
||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
float v = 0;
|
float v = 0;
|
||||||
ggml_vec_dot_f16(ne02, &v,
|
ggml_vec_dot_f16(ne02, &v, 0,
|
||||||
(ggml_fp16_t *) wdata_src + i1n,
|
(ggml_fp16_t *) wdata_src + i1n, 0,
|
||||||
(ggml_fp16_t *) wdata_kernel + i00*ne02);
|
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
|
||||||
dst_data[i10*s0 + i00] += v;
|
dst_data[i10*s0 + i00] += v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12466,9 +12522,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
||||||
const int i1n = i10*ne11;
|
const int i1n = i10*ne11;
|
||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
float v = 0;
|
float v = 0;
|
||||||
ggml_vec_dot_f32(ne02, &v,
|
ggml_vec_dot_f32(ne02, &v, 0,
|
||||||
wdata_src + i1n,
|
wdata_src + i1n, 0,
|
||||||
wdata_kernel + i00*ne02);
|
wdata_kernel + i00*ne02, 0, 1);
|
||||||
dst_data[i10*s0 + i00] += v;
|
dst_data[i10*s0 + i00] += v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12783,9 +12839,9 @@ static void ggml_compute_forward_conv_transpose_2d(
|
||||||
for (int i01 = 0; i01 < ne01; i01++) {
|
for (int i01 = 0; i01 < ne01; i01++) {
|
||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
float v = 0;
|
float v = 0;
|
||||||
ggml_vec_dot_f16(ne03, &v,
|
ggml_vec_dot_f16(ne03, &v, 0,
|
||||||
wdata_src + i1n,
|
wdata_src + i1n, 0,
|
||||||
wdata_kernel + i01*ne00*ne03 + i00*ne03);
|
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||||
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13214,9 +13270,9 @@ static void ggml_compute_forward_flash_attn_f32(
|
||||||
const int i1 = ik1;
|
const int i1 = ik1;
|
||||||
|
|
||||||
ggml_vec_dot_f32(neq0,
|
ggml_vec_dot_f32(neq0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// scale
|
// scale
|
||||||
|
@ -13299,9 +13355,9 @@ static void ggml_compute_forward_flash_attn_f32(
|
||||||
const int iv3 = iq3;
|
const int iv3 = iq3;
|
||||||
|
|
||||||
ggml_vec_dot_f32(masked_begin,
|
ggml_vec_dot_f32(masked_begin,
|
||||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||||
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||||
S);
|
S, 0, 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13404,9 +13460,9 @@ static void ggml_compute_forward_flash_attn_f16(
|
||||||
const int i1 = ik1;
|
const int i1 = ik1;
|
||||||
|
|
||||||
ggml_vec_dot_f16(neq0,
|
ggml_vec_dot_f16(neq0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||||
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
||||||
|
@ -13508,9 +13564,9 @@ static void ggml_compute_forward_flash_attn_f16(
|
||||||
const int iv3 = iq3;
|
const int iv3 = iq3;
|
||||||
|
|
||||||
ggml_vec_dot_f16(nev0,
|
ggml_vec_dot_f16(nev0,
|
||||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||||
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||||
S16);
|
S16, 0, 1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
||||||
|
@ -13652,9 +13708,9 @@ static void ggml_compute_forward_flash_ff_f16(
|
||||||
const int i1 = ib01;
|
const int i1 = ib01;
|
||||||
|
|
||||||
ggml_vec_dot_f16(nea0,
|
ggml_vec_dot_f16(nea0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
|
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
|
||||||
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)));
|
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
|
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
|
||||||
|
@ -13677,9 +13733,9 @@ static void ggml_compute_forward_flash_ff_f16(
|
||||||
for (int64_t ic = 0; ic < nec01; ++ic) {
|
for (int64_t ic = 0; ic < nec01; ++ic) {
|
||||||
|
|
||||||
ggml_vec_dot_f16(neb01,
|
ggml_vec_dot_f16(neb01,
|
||||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||||
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)),
|
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
|
||||||
S16);
|
S16, 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_add_f32(nec01,
|
ggml_vec_add_f32(nec01,
|
||||||
|
@ -13866,9 +13922,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
||||||
const int i1 = ik1;
|
const int i1 = ik1;
|
||||||
|
|
||||||
ggml_vec_dot_f32(neq0,
|
ggml_vec_dot_f32(neq0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// scale
|
// scale
|
||||||
|
@ -14013,7 +14069,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
||||||
|
|
||||||
// S = SM * (S - dot(SM, S))
|
// S = SM * (S - dot(SM, S))
|
||||||
float dot_SM_gradSM = 0;
|
float dot_SM_gradSM = 0;
|
||||||
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
|
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
|
||||||
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
|
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
|
||||||
ggml_vec_mul_f32 (masked_begin, S, S, SM);
|
ggml_vec_mul_f32 (masked_begin, S, S, SM);
|
||||||
|
|
||||||
|
@ -15311,7 +15367,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->is_param) {
|
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15345,7 +15401,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
||||||
|
|
||||||
clone->op = node->op;
|
clone->op = node->op;
|
||||||
clone->grad = node->grad;
|
clone->grad = node->grad;
|
||||||
clone->is_param = node->is_param;
|
clone->flags = node->flags;
|
||||||
clone->extra = node->extra;
|
clone->extra = node->extra;
|
||||||
for (int k = 0; k < GGML_MAX_DIMS; ++k) {
|
for (int k = 0; k < GGML_MAX_DIMS; ++k) {
|
||||||
clone->nb[k] = node->nb[k];
|
clone->nb[k] = node->nb[k];
|
||||||
|
@ -16377,7 +16433,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
|
||||||
for (int i = 0; i < gf->n_nodes; i++) {
|
for (int i = 0; i < gf->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = gf->nodes[i];
|
struct ggml_tensor * node = gf->nodes[i];
|
||||||
|
|
||||||
if (node->is_param) {
|
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||||
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
|
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
|
||||||
ggml_build_forward_expand(gb, node->grad);
|
ggml_build_forward_expand(gb, node->grad);
|
||||||
}
|
}
|
||||||
|
@ -17862,7 +17918,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
||||||
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
|
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
|
||||||
i,
|
i,
|
||||||
node->ne[0], node->ne[1], node->ne[2],
|
node->ne[0], node->ne[1], node->ne[2],
|
||||||
ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
|
ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
|
||||||
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
|
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
|
||||||
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
|
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
|
||||||
(double) node->perf_time_us / 1000.0,
|
(double) node->perf_time_us / 1000.0,
|
||||||
|
@ -17955,7 +18011,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->is_param) {
|
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||||
snprintf(color, sizeof(color), "yellow");
|
snprintf(color, sizeof(color), "yellow");
|
||||||
} else if (node->grad) {
|
} else if (node->grad) {
|
||||||
if (ggml_graph_find(gf, node)) {
|
if (ggml_graph_find(gf, node)) {
|
||||||
|
@ -18129,7 +18185,7 @@ static enum ggml_opt_result ggml_opt_adam(
|
||||||
int np = 0;
|
int np = 0;
|
||||||
int64_t nx = 0;
|
int64_t nx = 0;
|
||||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||||
if (gf->nodes[i]->is_param) {
|
if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||||
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
||||||
|
|
||||||
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
||||||
|
@ -18382,7 +18438,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute the initial gradient in the search direction
|
// compute the initial gradient in the search direction
|
||||||
ggml_vec_dot_f32(nx, &dginit, g, d);
|
ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
|
||||||
|
|
||||||
// make sure that d points to a descent direction
|
// make sure that d points to a descent direction
|
||||||
if (0 < dginit) {
|
if (0 < dginit) {
|
||||||
|
@ -18432,7 +18488,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_dot_f32(nx, &dg, g, d);
|
ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
|
||||||
|
|
||||||
// check the Wolfe condition
|
// check the Wolfe condition
|
||||||
if (dg < params->lbfgs.wolfe * dginit) {
|
if (dg < params->lbfgs.wolfe * dginit) {
|
||||||
|
@ -18492,7 +18548,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
int np = 0;
|
int np = 0;
|
||||||
int nx = 0;
|
int nx = 0;
|
||||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||||
if (gf->nodes[i]->is_param) {
|
if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||||
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
||||||
|
|
||||||
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
||||||
|
@ -18693,8 +18749,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
// ys = y^t \cdot s -> 1 / \rho.
|
// ys = y^t \cdot s -> 1 / \rho.
|
||||||
// yy = y^t \cdot y.
|
// yy = y^t \cdot y.
|
||||||
//
|
//
|
||||||
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
|
ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
|
||||||
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
|
ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
|
||||||
|
|
||||||
lm_ys[end[0]] = ys;
|
lm_ys[end[0]] = ys;
|
||||||
|
|
||||||
|
@ -18713,7 +18769,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
for (int i = 0; i < bound; ++i) {
|
for (int i = 0; i < bound; ++i) {
|
||||||
j[0] = (j[0] + m - 1) % m;
|
j[0] = (j[0] + m - 1) % m;
|
||||||
// \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
|
// \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
|
||||||
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
|
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
|
||||||
lm_alpha[j[0]] /= lm_ys[j[0]];
|
lm_alpha[j[0]] /= lm_ys[j[0]];
|
||||||
// q_{i} = q_{i+1} - \alpha_{i} y_{i}
|
// q_{i} = q_{i+1} - \alpha_{i} y_{i}
|
||||||
ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
|
ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
|
||||||
|
@ -18723,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
|
|
||||||
for (int i = 0; i < bound; ++i) {
|
for (int i = 0; i < bound; ++i) {
|
||||||
// \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
|
// \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
|
||||||
ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
|
ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
|
||||||
beta /= lm_ys[j[0]];
|
beta /= lm_ys[j[0]];
|
||||||
// \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
|
// \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
|
||||||
ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
|
ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
|
||||||
|
@ -18967,6 +19023,16 @@ enum ggml_opt_result ggml_opt_resume_g(
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
void ggml_set_input(struct ggml_tensor * tensor) {
|
||||||
|
tensor->flags |= GGML_TENSOR_FLAG_INPUT;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_set_output(struct ggml_tensor * tensor) {
|
||||||
|
tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void ggml_quantize_init(enum ggml_type type) {
|
void ggml_quantize_init(enum ggml_type type) {
|
||||||
ggml_critical_section_start();
|
ggml_critical_section_start();
|
||||||
|
|
||||||
|
@ -20611,4 +20677,12 @@ int ggml_cpu_has_vsx(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compile-time feature probe: returns 1 when this translation unit was built
// with __ARM_FEATURE_MATMUL_INT8 defined, and 0 otherwise.
int ggml_cpu_has_matmul_int8(void) {
#ifdef __ARM_FEATURE_MATMUL_INT8
    const int has_matmul_int8 = 1;
#else
    const int has_matmul_int8 = 0;
#endif
    return has_matmul_int8;
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
19
ggml.h
19
ggml.h
|
@ -510,6 +510,12 @@ extern "C" {
|
||||||
GGML_LOG_LEVEL_DEBUG = 5
|
GGML_LOG_LEVEL_DEBUG = 5
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Bit flags stored in ggml_tensor::flags. Each enumerator occupies its own
// bit so several can be combined with bitwise OR and tested with bitwise AND.
enum ggml_tensor_flag {
    GGML_TENSOR_FLAG_INPUT  = 1 << 0, // == 1
    GGML_TENSOR_FLAG_OUTPUT = 1 << 1, // == 2
    GGML_TENSOR_FLAG_PARAM  = 1 << 2, // == 4
};
|
||||||
|
|
||||||
// ggml object
|
// ggml object
|
||||||
struct ggml_object {
|
struct ggml_object {
|
||||||
size_t offs;
|
size_t offs;
|
||||||
|
@ -543,7 +549,7 @@ extern "C" {
|
||||||
// op params - allocated as int32_t for alignment
|
// op params - allocated as int32_t for alignment
|
||||||
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
||||||
|
|
||||||
bool is_param;
|
int32_t flags;
|
||||||
|
|
||||||
struct ggml_tensor * grad;
|
struct ggml_tensor * grad;
|
||||||
struct ggml_tensor * src[GGML_MAX_SRC];
|
struct ggml_tensor * src[GGML_MAX_SRC];
|
||||||
|
@ -2092,6 +2098,12 @@ extern "C" {
|
||||||
ggml_opt_callback callback,
|
ggml_opt_callback callback,
|
||||||
void * callback_data);
|
void * callback_data);
|
||||||
|
|
||||||
|
//
|
||||||
|
// tensor flags
|
||||||
|
//
|
||||||
|
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||||
|
|
||||||
//
|
//
|
||||||
// quantization
|
// quantization
|
||||||
//
|
//
|
||||||
|
@ -2278,6 +2290,7 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||||
GGML_API int ggml_cpu_has_sycl (void);
|
GGML_API int ggml_cpu_has_sycl (void);
|
||||||
GGML_API int ggml_cpu_has_vsx (void);
|
GGML_API int ggml_cpu_has_vsx (void);
|
||||||
|
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Internal types and functions exposed for tests and benchmarks
|
// Internal types and functions exposed for tests and benchmarks
|
||||||
|
@ -2291,7 +2304,8 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||||
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||||
|
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const char * type_name;
|
const char * type_name;
|
||||||
|
@ -2303,6 +2317,7 @@ extern "C" {
|
||||||
ggml_from_float_t from_float_reference;
|
ggml_from_float_t from_float_reference;
|
||||||
ggml_vec_dot_t vec_dot;
|
ggml_vec_dot_t vec_dot;
|
||||||
enum ggml_type vec_dot_type;
|
enum ggml_type vec_dot_type;
|
||||||
|
int64_t nrows; // number of rows to process simultaneously;
|
||||||
} ggml_type_traits_t;
|
} ggml_type_traits_t;
|
||||||
|
|
||||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||||
|
|
45
gguf-py/examples/reader.py
Normal file
45
gguf-py/examples/reader.py
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys
from pathlib import Path

# Put the directory two levels above this script (the parent of the folder
# holding reader.py) at the front of sys.path *before* importing gguf.
# The path tweak only influences imports executed after it, so it must come
# ahead of the gguf import for the local package to be picked up.
sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf.gguf_reader import GGUFReader  # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
def read_gguf_file(gguf_file_path):
    """
    Print the key-value pairs and the tensor listing of a GGUF file to stdout.

    Parameters:
    - gguf_file_path: Path to the GGUF file; passed unchanged to GGUFReader.

    Returns:
    - None. All output is written with print().
    """

    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs, padding every key to the longest key's width
    print("Key-Value Pairs:")
    # default=0 keeps max() from raising ValueError when there are no fields
    max_key_length = max((len(key) for key in reader.fields.keys()), default=0)
    for key, field in reader.fields.items():
        # field.data[0] is used as an index into field.parts. A field whose
        # data sequence is empty would raise IndexError here, so show a
        # placeholder instead of aborting the whole listing.
        # NOTE(review): only the first data entry is displayed -- confirm that
        # is the intended summary for multi-entry fields.
        if len(field.data) > 0:
            value = field.parts[field.data[0]]
        else:
            value = "<no data>"
        print(f"{key:{max_key_length}} : {value}")
    print("----")

    # List all tensors as a fixed-width table
    print("Tensors:")
    tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
    print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization"))
    print("-" * 80)
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str))
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: the first command-line argument is taken as the path of
# the GGUF file to inspect; any further arguments are ignored.
if __name__ == '__main__':
    try:
        gguf_file_path = sys.argv[1]
    except IndexError:
        # No path was supplied on the command line.
        print("Usage: reader.py <path_to_gguf_file>")
        sys.exit(1)
    read_gguf_file(gguf_file_path)
|
|
@ -40,6 +40,7 @@ class Keys:
|
||||||
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||||
EXPERT_COUNT = "{arch}.expert_count"
|
EXPERT_COUNT = "{arch}.expert_count"
|
||||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||||
|
POOLING_LAYER = "{arch}.pooling_layer"
|
||||||
|
|
||||||
class Attention:
|
class Attention:
|
||||||
HEAD_COUNT = "{arch}.attention.head_count"
|
HEAD_COUNT = "{arch}.attention.head_count"
|
||||||
|
@ -50,6 +51,7 @@ class Keys:
|
||||||
VALUE_LENGTH = "{arch}.attention.value_length"
|
VALUE_LENGTH = "{arch}.attention.value_length"
|
||||||
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
||||||
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
||||||
|
CAUSAL = "{arch}.attention.causal"
|
||||||
|
|
||||||
class Rope:
|
class Rope:
|
||||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||||
|
@ -63,6 +65,7 @@ class Keys:
|
||||||
MODEL = "tokenizer.ggml.model"
|
MODEL = "tokenizer.ggml.model"
|
||||||
LIST = "tokenizer.ggml.tokens"
|
LIST = "tokenizer.ggml.tokens"
|
||||||
TOKEN_TYPE = "tokenizer.ggml.token_type"
|
TOKEN_TYPE = "tokenizer.ggml.token_type"
|
||||||
|
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
|
||||||
SCORES = "tokenizer.ggml.scores"
|
SCORES = "tokenizer.ggml.scores"
|
||||||
MERGES = "tokenizer.ggml.merges"
|
MERGES = "tokenizer.ggml.merges"
|
||||||
BOS_ID = "tokenizer.ggml.bos_token_id"
|
BOS_ID = "tokenizer.ggml.bos_token_id"
|
||||||
|
@ -95,6 +98,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
PERSIMMON = auto()
|
PERSIMMON = auto()
|
||||||
REFACT = auto()
|
REFACT = auto()
|
||||||
BERT = auto()
|
BERT = auto()
|
||||||
|
NOMIC_BERT = auto()
|
||||||
BLOOM = auto()
|
BLOOM = auto()
|
||||||
STABLELM = auto()
|
STABLELM = auto()
|
||||||
QWEN = auto()
|
QWEN = auto()
|
||||||
|
@ -122,6 +126,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
ATTN_OUT = auto()
|
ATTN_OUT = auto()
|
||||||
ATTN_NORM = auto()
|
ATTN_NORM = auto()
|
||||||
ATTN_NORM_2 = auto()
|
ATTN_NORM_2 = auto()
|
||||||
|
ATTN_OUT_NORM = auto()
|
||||||
ATTN_ROT_EMBD = auto()
|
ATTN_ROT_EMBD = auto()
|
||||||
FFN_GATE_INP = auto()
|
FFN_GATE_INP = auto()
|
||||||
FFN_NORM = auto()
|
FFN_NORM = auto()
|
||||||
|
@ -134,6 +139,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
FFN_UP_EXP = auto()
|
FFN_UP_EXP = auto()
|
||||||
ATTN_Q_NORM = auto()
|
ATTN_Q_NORM = auto()
|
||||||
ATTN_K_NORM = auto()
|
ATTN_K_NORM = auto()
|
||||||
|
LAYER_OUT_NORM = auto()
|
||||||
|
|
||||||
|
|
||||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
|
@ -148,6 +154,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.PERSIMMON: "persimmon",
|
MODEL_ARCH.PERSIMMON: "persimmon",
|
||||||
MODEL_ARCH.REFACT: "refact",
|
MODEL_ARCH.REFACT: "refact",
|
||||||
MODEL_ARCH.BERT: "bert",
|
MODEL_ARCH.BERT: "bert",
|
||||||
|
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
||||||
MODEL_ARCH.BLOOM: "bloom",
|
MODEL_ARCH.BLOOM: "bloom",
|
||||||
MODEL_ARCH.STABLELM: "stablelm",
|
MODEL_ARCH.STABLELM: "stablelm",
|
||||||
MODEL_ARCH.QWEN: "qwen",
|
MODEL_ARCH.QWEN: "qwen",
|
||||||
|
@ -178,6 +185,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||||
|
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
||||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||||
|
@ -187,6 +195,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
|
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
|
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
|
||||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
|
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
||||||
}
|
}
|
||||||
|
|
||||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
|
@ -262,17 +271,32 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
],
|
],
|
||||||
MODEL_ARCH.BERT: [
|
MODEL_ARCH.BERT: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||||
MODEL_TENSOR.TOKEN_TYPES,
|
MODEL_TENSOR.TOKEN_TYPES,
|
||||||
MODEL_TENSOR.POS_EMBD,
|
MODEL_TENSOR.POS_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
MODEL_TENSOR.ATTN_NORM,
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
||||||
MODEL_TENSOR.ATTN_Q,
|
MODEL_TENSOR.ATTN_Q,
|
||||||
MODEL_TENSOR.ATTN_K,
|
MODEL_TENSOR.ATTN_K,
|
||||||
MODEL_TENSOR.ATTN_V,
|
MODEL_TENSOR.ATTN_V,
|
||||||
MODEL_TENSOR.ATTN_OUT,
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
MODEL_TENSOR.FFN_NORM,
|
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||||
|
],
|
||||||
|
MODEL_ARCH.NOMIC_BERT: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||||
|
MODEL_TENSOR.TOKEN_TYPES,
|
||||||
|
MODEL_TENSOR.POS_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_QKV,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||||
],
|
],
|
||||||
MODEL_ARCH.MPT: [
|
MODEL_ARCH.MPT: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
|
|
@ -357,6 +357,12 @@ class GGUFWriter:
|
||||||
def add_layer_norm_rms_eps(self, value: float) -> None:
|
def add_layer_norm_rms_eps(self, value: float) -> None:
|
||||||
self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
|
self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
def add_causal_attention(self, value: bool) -> None:
    # Write the boolean under the per-architecture "{arch}.attention.causal" key.
    causal_key = Keys.Attention.CAUSAL.format(arch=self.arch)
    self.add_bool(causal_key, value)
|
||||||
|
|
||||||
|
def add_pooling_layer(self, value: bool) -> None:
    # Write the boolean under the per-architecture Keys.LLM.POOLING_LAYER key.
    pooling_key = Keys.LLM.POOLING_LAYER.format(arch=self.arch)
    self.add_bool(pooling_key, value)
|
||||||
|
|
||||||
def add_rope_dimension_count(self, count: int) -> None:
|
def add_rope_dimension_count(self, count: int) -> None:
|
||||||
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
||||||
|
|
||||||
|
@ -387,6 +393,9 @@ class GGUFWriter:
|
||||||
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
|
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
|
||||||
self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)
|
self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)
|
||||||
|
|
||||||
|
def add_token_type_count(self, value: int) -> None:
    # Write the count as a uint32 under the fixed tokenizer key
    # Keys.Tokenizer.TOKEN_TYPE_COUNT (no architecture prefix is applied).
    count_key = Keys.Tokenizer.TOKEN_TYPE_COUNT
    self.add_uint32(count_key, value)
|
||||||
|
|
||||||
def add_token_scores(self, scores: Sequence[float]) -> None:
|
def add_token_scores(self, scores: Sequence[float]) -> None:
|
||||||
self.add_array(Keys.Tokenizer.SCORES, scores)
|
self.add_array(Keys.Tokenizer.SCORES, scores)
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ class TensorNameMap:
|
||||||
"word_embeddings", # bloom
|
"word_embeddings", # bloom
|
||||||
"model.embed_tokens", # llama-hf
|
"model.embed_tokens", # llama-hf
|
||||||
"tok_embeddings", # llama-pth
|
"tok_embeddings", # llama-pth
|
||||||
"embeddings.word_embeddings", # bert
|
"embeddings.word_embeddings", # bert nomic-bert
|
||||||
"language_model.embedding.word_embeddings", # persimmon
|
"language_model.embedding.word_embeddings", # persimmon
|
||||||
"wte", # gpt2
|
"wte", # gpt2
|
||||||
"transformer.embd.wte", # phi2
|
"transformer.embd.wte", # phi2
|
||||||
|
@ -24,12 +24,14 @@ class TensorNameMap:
|
||||||
|
|
||||||
# Token type embeddings
|
# Token type embeddings
|
||||||
MODEL_TENSOR.TOKEN_TYPES: (
|
MODEL_TENSOR.TOKEN_TYPES: (
|
||||||
"embeddings.token_type_embeddings", # bert
|
"embeddings.token_type_embeddings", # bert nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Normalization of token embeddings
|
# Normalization of token embeddings
|
||||||
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
||||||
"word_embeddings_layernorm", # bloom
|
"word_embeddings_layernorm", # bloom
|
||||||
|
"embeddings.LayerNorm", # bert
|
||||||
|
"emb_ln", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Position embeddings
|
# Position embeddings
|
||||||
|
@ -54,7 +56,6 @@ class TensorNameMap:
|
||||||
"transformer.ln_f", # gpt2 gpt-j falcon
|
"transformer.ln_f", # gpt2 gpt-j falcon
|
||||||
"model.norm", # llama-hf baichuan internlm2
|
"model.norm", # llama-hf baichuan internlm2
|
||||||
"norm", # llama-pth
|
"norm", # llama-pth
|
||||||
"embeddings.LayerNorm", # bert
|
|
||||||
"transformer.norm_f", # mpt
|
"transformer.norm_f", # mpt
|
||||||
"ln_f", # refact bloom qwen gpt2
|
"ln_f", # refact bloom qwen gpt2
|
||||||
"language_model.encoder.final_layernorm", # persimmon
|
"language_model.encoder.final_layernorm", # persimmon
|
||||||
|
@ -79,7 +80,6 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||||
"model.layers.{bid}.input_layernorm", # llama-hf
|
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||||
"layers.{bid}.attention_norm", # llama-pth
|
"layers.{bid}.attention_norm", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
|
||||||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||||
"model.layers.{bid}.ln1", # yi
|
"model.layers.{bid}.ln1", # yi
|
||||||
"h.{bid}.ln_1", # gpt2
|
"h.{bid}.ln_1", # gpt2
|
||||||
|
@ -104,6 +104,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.self_attn.query_key_value", # persimmon
|
"model.layers.{bid}.self_attn.query_key_value", # persimmon
|
||||||
"h.{bid}.attn.c_attn", # gpt2
|
"h.{bid}.attn.c_attn", # gpt2
|
||||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||||
|
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention query
|
# Attention query
|
||||||
|
@ -153,6 +154,13 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||||
"model.layers.{bid}.attention.wo", # internlm2
|
"model.layers.{bid}.attention.wo", # internlm2
|
||||||
|
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
|
||||||
|
),
|
||||||
|
|
||||||
|
# Attention output norm
|
||||||
|
MODEL_TENSOR.ATTN_OUT_NORM: (
|
||||||
|
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||||
|
"encoder.layers.{bid}.norm1", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Rotary embeddings
|
# Rotary embeddings
|
||||||
|
@ -171,7 +179,6 @@ class TensorNameMap:
|
||||||
"transformer.blocks.{bid}.norm_2", # mpt
|
"transformer.blocks.{bid}.norm_2", # mpt
|
||||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||||
"layers.{bid}.ffn_norm", # llama-pth
|
"layers.{bid}.ffn_norm", # llama-pth
|
||||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
|
||||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||||
"model.layers.{bid}.ln2", # yi
|
"model.layers.{bid}.ln2", # yi
|
||||||
"h.{bid}.ln_2", # gpt2
|
"h.{bid}.ln_2", # gpt2
|
||||||
|
@ -202,6 +209,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.mlp.fc1", # phi2
|
"model.layers.{bid}.mlp.fc1", # phi2
|
||||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||||
"model.layers.{bid}.feed_forward.w3", # internlm2
|
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||||
|
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_EXP: (
|
MODEL_TENSOR.FFN_UP_EXP: (
|
||||||
|
@ -221,6 +229,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.mlp.w2", # qwen
|
"transformer.h.{bid}.mlp.w2", # qwen
|
||||||
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||||
"model.layers.{bid}.feed_forward.w1", # internlm2
|
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||||
|
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||||
|
@ -246,6 +255,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.mlp.fc2", # phi2
|
"model.layers.{bid}.mlp.fc2", # phi2
|
||||||
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||||
"model.layers.{bid}.feed_forward.w2", # internlm2
|
"model.layers.{bid}.feed_forward.w2", # internlm2
|
||||||
|
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||||
|
@ -266,6 +276,11 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.ROPE_FREQS: (
|
MODEL_TENSOR.ROPE_FREQS: (
|
||||||
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
|
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM: (
|
||||||
|
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||||
|
"encoder.layers.{bid}.norm2", # nomic-bert
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||||
|
|
6
llama.h
6
llama.h
|
@ -61,6 +61,7 @@ extern "C" {
|
||||||
enum llama_vocab_type {
|
enum llama_vocab_type {
|
||||||
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
||||||
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
|
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
|
||||||
|
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llama_token_type {
|
enum llama_token_type {
|
||||||
|
@ -235,6 +236,7 @@ extern "C" {
|
||||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||||
bool embedding; // embedding mode only
|
bool embedding; // embedding mode only
|
||||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||||
|
bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
|
||||||
};
|
};
|
||||||
|
|
||||||
// model quantization parameters
|
// model quantization parameters
|
||||||
|
@ -627,6 +629,10 @@ extern "C" {
|
||||||
// shape: [n_embd] (1-dimensional)
|
// shape: [n_embd] (1-dimensional)
|
||||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||||
|
|
||||||
|
// Get the embeddings for the ith sequence
|
||||||
|
// llama_get_embeddings(ctx) + i*n_embd
|
||||||
|
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Vocab
|
// Vocab
|
||||||
//
|
//
|
||||||
|
|
|
@ -156,8 +156,8 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
float fs;
|
float fs;
|
||||||
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data());
|
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
|
||||||
else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data());
|
else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||||
if (iloop > 3) ggml.addResult(fs, t);
|
if (iloop > 3) ggml.addResult(fs, t);
|
||||||
|
|
|
@ -284,8 +284,8 @@ int main(int argc, char** argv) {
|
||||||
else {
|
else {
|
||||||
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
|
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
|
||||||
vdot.from_float(y1.data(), q8.data(), kVecSize);
|
vdot.from_float(y1.data(), q8.data(), kVecSize);
|
||||||
if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data());
|
if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
||||||
else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data());
|
else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
||||||
}
|
}
|
||||||
sumq += result;
|
sumq += result;
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
2c7cf49810d523b9632da393a9e8270b60bf3b24
|
5070f078a67c18c11736e78316ab715ca9afde16
|
||||||
|
|
1
spm-headers/ggml-alloc.h
Symbolic link
1
spm-headers/ggml-alloc.h
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../ggml-alloc.h
|
1
spm-headers/ggml-backend.h
Symbolic link
1
spm-headers/ggml-backend.h
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../ggml-backend.h
|
1
spm-headers/ggml.h
Symbolic link
1
spm-headers/ggml.h
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../ggml.h
|
|
@ -2129,14 +2129,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||||
test_cases.emplace_back(new test_pad());
|
test_cases.emplace_back(new test_pad());
|
||||||
test_cases.emplace_back(new test_leaky_relu());
|
test_cases.emplace_back(new test_leaky_relu());
|
||||||
|
|
||||||
|
// these tests are disabled to save execution time, but they can be handy for debugging
|
||||||
|
#if 0
|
||||||
#if !defined(__SANITIZE_THREAD__)
|
#if !defined(__SANITIZE_THREAD__)
|
||||||
// FIXME: these tests use too much memory with thread sanitizer
|
// FIXME: these tests use too much memory with thread sanitizer
|
||||||
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
|
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
|
||||||
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
|
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// these tests are disabled to save execution time, but they can be handy for debugging
|
|
||||||
#if 0
|
|
||||||
test_cases.emplace_back(new test_llama(1));
|
test_cases.emplace_back(new test_llama(1));
|
||||||
test_cases.emplace_back(new test_llama(2));
|
test_cases.emplace_back(new test_llama(2));
|
||||||
test_cases.emplace_back(new test_falcon(1));
|
test_cases.emplace_back(new test_falcon(1));
|
||||||
|
|
|
@ -87,7 +87,7 @@ static float dot_product_error(
|
||||||
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
||||||
|
|
||||||
float result = INFINITY;
|
float result = INFINITY;
|
||||||
qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());
|
qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||||
|
|
||||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||||
|
|
||||||
|
|
|
@ -346,7 +346,7 @@ int main(int argc, char * argv[]) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void) -> float {
|
auto quantize_fn = [&](void) -> float {
|
||||||
float result;
|
float result;
|
||||||
qfns.vec_dot(size, &result, test_q1, test_q2);
|
qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||||
return result;
|
return result;
|
||||||
};
|
};
|
||||||
size_t quantized_size = ggml_row_size(type, size);
|
size_t quantized_size = ggml_row_size(type, size);
|
||||||
|
|
|
@ -4,13 +4,13 @@
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <codecvt>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <string>
|
|
||||||
#include <codecvt>
|
|
||||||
#include <map>
|
|
||||||
#include <vector>
|
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
|
@ -74,45 +74,46 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (const std::invalid_argument &) {
|
catch (const std::invalid_argument &) {
|
||||||
fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
|
//fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
// unicode
|
||||||
// NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
|
{
|
||||||
if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
|
const int nthread = std::thread::hardware_concurrency();
|
||||||
std::string str = " " + codepoint_to_utf8(cp);
|
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
std::vector<std::thread> threads(nthread);
|
||||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
|
||||||
if (str != check) {
|
for (int i = 0; i < nthread; ++i) {
|
||||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
threads[i] = std::thread([i, nthread, ctx]() {
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
|
||||||
return 3;
|
if (!( // NOLINT
|
||||||
|
(cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 &&
|
||||||
|
(cp < 0x13 || cp > 0x17) && cp != 0x19 &&
|
||||||
|
(cp < 0x1c || cp > 0x1e) &&
|
||||||
|
(cp < 0xd800 || cp > 0xdfff) &&
|
||||||
|
(cp < 0x00040000 || cp >= 0x000e0000)
|
||||||
|
)) {
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
// Restrict to assigned unicode planes
|
|
||||||
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
|
||||||
for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
|
|
||||||
std::string str = codepoint_to_utf8(cp);
|
std::string str = codepoint_to_utf8(cp);
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||||
if (str != check) {
|
if (cp != 9601 && str != check) {
|
||||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||||
return 4;
|
std::exit(3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
|
});
|
||||||
std::string str = codepoint_to_utf8(cp);
|
}
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
|
||||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
for (auto & t : threads) {
|
||||||
if (str != check) {
|
t.join();
|
||||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
|
||||||
return 4;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
|
||||||
|
|
|
@ -4,13 +4,13 @@
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <codecvt>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <string>
|
|
||||||
#include <codecvt>
|
|
||||||
#include <map>
|
|
||||||
#include <vector>
|
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
|
@ -72,26 +72,33 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
// unicode
|
||||||
if (cp < 0xd800 || cp > 0xdfff) {
|
{
|
||||||
|
const int nthread = std::thread::hardware_concurrency();
|
||||||
|
|
||||||
|
std::vector<std::thread> threads(nthread);
|
||||||
|
|
||||||
|
for (int i = 0; i < nthread; ++i) {
|
||||||
|
threads[i] = std::thread([i, nthread, ctx]() {
|
||||||
|
for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
|
||||||
|
if (cp >= 0xd800 && cp <= 0xdfff) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
std::string str = codepoint_to_utf8(cp);
|
std::string str = codepoint_to_utf8(cp);
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||||
if (cp != 9601 && str != check) {
|
if (cp != 9601 && str != check) {
|
||||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||||
return 3;
|
std::exit(3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
|
||||||
std::string str = codepoint_to_utf8(cp);
|
for (auto & t : threads) {
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
t.join();
|
||||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
|
||||||
if (str != check) {
|
|
||||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
|
||||||
return 4;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -526,7 +526,7 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
|
||||||
offset += 1;
|
offset += 1;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
else if (!(utf8[offset + 0] & 0x40)) {
|
if (!(utf8[offset + 0] & 0x40)) {
|
||||||
throw std::invalid_argument("invalid character");
|
throw std::invalid_argument("invalid character");
|
||||||
}
|
}
|
||||||
else if (!(utf8[offset + 0] & 0x20)) {
|
else if (!(utf8[offset + 0] & 0x20)) {
|
||||||
|
@ -614,6 +614,7 @@ static bool codepoint_type_init_search_vector() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static size_t binary_search_implement(uint32_t cp, const std::vector<std::pair<uint32_t, uint32_t>> & ranges) {
|
static size_t binary_search_implement(uint32_t cp, const std::vector<std::pair<uint32_t, uint32_t>> & ranges) {
|
||||||
size_t left = 0;
|
size_t left = 0;
|
||||||
size_t right = ranges.size() - 1;
|
size_t right = ranges.size() - 1;
|
||||||
|
@ -657,8 +658,8 @@ static bool codepoint_type_init() {
|
||||||
}
|
}
|
||||||
|
|
||||||
static int codepoint_type(uint32_t cp) {
|
static int codepoint_type(uint32_t cp) {
|
||||||
static bool codepoint_type_initialized = codepoint_type_init();
|
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
|
||||||
return codepoint_type_binary_search(cp);
|
return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int codepoint_type(const std::string & utf8) {
|
static int codepoint_type(const std::string & utf8) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue