Merge branch 'master' into multiple-mount-points
Commit: 00d5cdbdbe
41 changed files with 1846 additions and 1014 deletions
.github/workflows/build.yml (vendored): 3 changes
@@ -669,8 +669,7 @@ jobs:
       run: |
         cd examples/llama.android

-        # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
-        ./gradlew build --no-daemon -Pskip-armeabi-v7a
+        ./gradlew build --no-daemon

 #  freeBSD-latest:
 #    runs-on: macos-12
@@ -936,10 +936,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
     list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
 endif()
 if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-    # Raspberry Pi 2
-    list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+    if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+        # Android armeabi-v7a
+        list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+    else()
+        # Raspberry Pi 2
+        list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+    endif()
 endif()
 if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+    # Android arm64-v8a
     # Raspberry Pi 3, 4, Zero 2 (32-bit)
     list(APPEND ARCH_FLAGS -mno-unaligned-access)
 endif()
Makefile: 2 changes
@@ -597,7 +597,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
-CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
@@ -114,6 +114,9 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)

+**HTTP server**
+
+[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

 **Bindings:**


@@ -155,6 +158,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [semperai/amica](https://github.com/semperai/amica)
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
+- [Msty](https://msty.app) (proprietary)

 ---
@@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     break;
 }
 std::string value(argv[i]);
-/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
-else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
-else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
 else { invalid_param = true; break; }
 } else if (arg == "--rope-scale") {
 if (++i >= argc) {

@@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 }
 std::string arg_next = argv[i];
 if (arg_next == "none") {
-    params.split_mode = LLAMA_SPLIT_NONE;
+    params.split_mode = LLAMA_SPLIT_MODE_NONE;
 } else if (arg_next == "layer") {
-    params.split_mode = LLAMA_SPLIT_LAYER;
+    params.split_mode = LLAMA_SPLIT_MODE_LAYER;
 } else if (arg_next == "row") {
-    params.split_mode = LLAMA_SPLIT_ROW;
+    params.split_mode = LLAMA_SPLIT_MODE_ROW;
 } else {
     invalid_param = true;
     break;

@@ -837,15 +837,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 sep++;
 if (strncmp(sep, "int:", 4) == 0) {
     sep += 4;
-    kvo.tag = LLAMA_KV_OVERRIDE_INT;
+    kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
     kvo.int_value = std::atol(sep);
 } else if (strncmp(sep, "float:", 6) == 0) {
     sep += 6;
-    kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
+    kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
     kvo.float_value = std::atof(sep);
 } else if (strncmp(sep, "bool:", 5) == 0) {
     sep += 5;
-    kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
+    kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
     if (std::strcmp(sep, "true") == 0) {
         kvo.bool_value = true;
     } else if (std::strcmp(sep, "false") == 0) {
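Note: the `--override-kv` parsing above dispatches on the `int:`, `float:` and `bool:` prefixes and stores the value in the matching field of the override struct, only the enum tag names change in this merge. The following self-contained sketch illustrates the same dispatch pattern; the `kv_override` struct and `parse_kv_override` helper are illustrative stand-ins and not the actual llama.cpp types, and the key string is just an example.

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>

// Illustrative stand-ins for the real override types (assumption: the real
// struct lives in llama.h and stores the key differently).
enum kv_override_type { KV_OVERRIDE_TYPE_INT, KV_OVERRIDE_TYPE_FLOAT, KV_OVERRIDE_TYPE_BOOL };

struct kv_override {
    std::string      key;
    kv_override_type tag;
    long             int_value   = 0;
    double           float_value = 0.0;
    bool             bool_value  = false;
};

// Parse "KEY=TYPE:VALUE", e.g. "some.key=bool:false".
static bool parse_kv_override(const char * data, kv_override & kvo) {
    const char * sep = std::strchr(data, '=');
    if (sep == nullptr) {
        return false;
    }
    kvo.key.assign(data, sep - data);
    sep++; // skip '='
    if (std::strncmp(sep, "int:", 4) == 0) {
        kvo.tag = KV_OVERRIDE_TYPE_INT;
        kvo.int_value = std::atol(sep + 4);
    } else if (std::strncmp(sep, "float:", 6) == 0) {
        kvo.tag = KV_OVERRIDE_TYPE_FLOAT;
        kvo.float_value = std::atof(sep + 6);
    } else if (std::strncmp(sep, "bool:", 5) == 0) {
        kvo.tag = KV_OVERRIDE_TYPE_BOOL;
        kvo.bool_value = std::strcmp(sep + 5, "true") == 0;
    } else {
        return false;
    }
    return true;
}

int main() {
    kv_override kvo;
    if (parse_kv_override("some.key=int:2", kvo)) {
        std::printf("key=%s tag=%d int=%ld\n", kvo.key.c_str(), (int) kvo.tag, kvo.int_value);
    }
    return 0;
}
```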
@@ -61,7 +61,7 @@ struct gpt_params {
 float p_split = 0.1f; // speculative decoding split probability
 int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
 int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
+llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
 float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
 int32_t n_beams = 0; // if non-zero then use beam search of given width.

@@ -75,7 +75,7 @@ struct gpt_params {
 float yarn_beta_fast = 32.0f; // YaRN low correction dim
 float yarn_beta_slow = 1.0f; // YaRN high correction dim
 int32_t yarn_orig_ctx = 0; // YaRN original context length
-int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
+int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

 // // sampling parameters
@@ -266,7 +266,7 @@ static llama_token llama_sampling_sample_impl(
 //    }
 //}

-LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
 }
 }
@@ -31,7 +31,7 @@ struct train_state * init_train_state() {

 state->opt = new struct ggml_opt_context;
 state->opt->ctx = NULL;
-state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
 state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
 state->opt->loss_after = 0.0f;

@@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
 std::string opt_type;
 GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
 if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
-    opt->params.type = GGML_OPT_ADAM;
+    opt->params.type = GGML_OPT_TYPE_ADAM;

     GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
     GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);

@@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
 copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
 copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
 } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
-    opt->params.type = GGML_OPT_LBFGS;
+    opt->params.type = GGML_OPT_TYPE_LBFGS;

     GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
     GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);

@@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
 gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);

 switch (opt->params.type) {
-    case GGML_OPT_ADAM:
+    case GGML_OPT_TYPE_ADAM:
         {
             gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
             gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best);

@@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
             gguf_add_tensor(fctx, opt->adam.pf);
         }
     } break;
-    case GGML_OPT_LBFGS:
+    case GGML_OPT_TYPE_LBFGS:
         {
             gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
             gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
@@ -192,7 +192,7 @@ class Model:
 return RefactModel
 if model_architecture == "PersimmonForCausalLM":
     return PersimmonModel
-if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
+if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
     return StableLMModel
 if model_architecture == "QWenLMHeadModel":
     return QwenModel

@@ -253,7 +253,7 @@ class Model:
 return gguf.MODEL_ARCH.REFACT
 if arch == "PersimmonForCausalLM":
     return gguf.MODEL_ARCH.PERSIMMON
-if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
+if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
     return gguf.MODEL_ARCH.STABLELM
 if arch == "QWenLMHeadModel":
     return gguf.MODEL_ARCH.QWEN

@@ -1074,10 +1074,11 @@ class StableLMModel(Model):
 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
 self.gguf_writer.add_block_count(block_count)
 self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
 self.gguf_writer.add_head_count(hparams["num_attention_heads"])
 self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
-self.gguf_writer.add_layer_norm_eps(1e-5)
+self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))


 class MixtralModel(Model):
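Note: the rope dimension count written above is the rotary fraction applied to the per-head dimension, i.e. rope_dims = rotary_factor * (hidden_size / num_attention_heads); the change only makes the hyperparameter lookup accept either `partial_rotary_factor` or the older `rope_pct` name. As a purely hypothetical worked example, a checkpoint with hidden_size 2048, 32 attention heads and partial_rotary_factor 0.25 would get int(0.25 * (2048 // 32)) = 16.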
@@ -1547,7 +1547,7 @@ int main(int argc, char ** argv) {

 float error_before_opt = ggml_get_f32_1d(e, 0);

-struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
+struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
 opt_params_lbfgs.print_forward_graph = false;
 opt_params_lbfgs.print_backward_graph = false;
 opt_params_lbfgs.lbfgs.n_iter = 16;
@@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) {
 lora.hparams.n_rank_output = n_rank_output;

 // set opt params from command line
-opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
 opt->params.print_forward_graph = false;
 opt->params.print_backward_graph = false;
 opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
@@ -447,8 +447,8 @@ int main(int argc, char ** argv) {
 LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
     n_past, n_left, n_ctx, params.n_keep, n_discard);

 llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
-llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

 n_past -= n_discard;
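Note: the rename keeps the same semantics: llama_kv_cache_seq_rm drops the cells in [p0, p1) and llama_kv_cache_seq_add then applies a (here negative) delta to the positions of the remaining cells. A minimal self-contained sketch of that bookkeeping, operating on a plain vector of positions instead of the real KV cache and ignoring the +1 BOS offset used in the hunk above:

```cpp
#include <cstdio>
#include <vector>

// Toy stand-in for one sequence of the KV cache: one position per cached token.
// Illustrates the rm + add(-n_discard) pattern used when the context is full.
int main() {
    const int n_keep = 4;
    std::vector<int> pos;              // positions 0..15 currently "cached"
    for (int i = 0; i < 16; ++i) pos.push_back(i);

    int n_past    = (int) pos.size();
    int n_left    = n_past - n_keep;
    int n_discard = n_left / 2;

    // "seq_rm": erase the cells holding positions [n_keep, n_keep + n_discard)
    pos.erase(pos.begin() + n_keep, pos.begin() + n_keep + n_discard);

    // "seq_add": shift the positions of the remaining tail back by n_discard
    for (size_t i = n_keep; i < pos.size(); ++i) pos[i] -= n_discard;

    n_past -= n_discard;
    std::printf("n_past = %d, last position = %d\n", n_past, pos.back()); // 10, 9
    return 0;
}
```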
@@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) {

 static const char * split_mode_str(llama_split_mode mode) {
     switch (mode) {
-        case LLAMA_SPLIT_NONE: return "none";
-        case LLAMA_SPLIT_LAYER: return "layer";
-        case LLAMA_SPLIT_ROW: return "row";
+        case LLAMA_SPLIT_MODE_NONE: return "none";
+        case LLAMA_SPLIT_MODE_LAYER: return "layer";
+        case LLAMA_SPLIT_MODE_ROW: return "row";
         default: GGML_ASSERT(!"invalid split mode");
     }
 }

@@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = {
 /* type_v */ {GGML_TYPE_F16},
 /* n_threads */ {get_num_physical_cores()},
 /* n_gpu_layers */ {99},
-/* split_mode */ {LLAMA_SPLIT_LAYER},
+/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
 /* main_gpu */ {0},
 /* no_kv_offload */ {false},
 /* mul_mat_q */ {true},

@@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 for (const auto & m : p) {
     llama_split_mode mode;
     if (m == "none") {
-        mode = LLAMA_SPLIT_NONE;
+        mode = LLAMA_SPLIT_MODE_NONE;
     } else if (m == "layer") {
-        mode = LLAMA_SPLIT_LAYER;
+        mode = LLAMA_SPLIT_MODE_LAYER;
     } else if (m == "row") {
-        mode = LLAMA_SPLIT_ROW;
+        mode = LLAMA_SPLIT_MODE_ROW;
     } else {
         invalid_param = true;
         break;
@@ -21,12 +21,8 @@ android {
     useSupportLibrary = true
 }
 ndk {
-    // Workaround for https://github.com/llvm/llvm-project/issues/65820
-    // affecting armeabi-v7a. Skip armeabi-v7a when invoked with
-    // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
-    if (project.hasProperty("skip-armeabi-v7a")) {
-        abiFilters += listOf("arm64-v8a", "x86_64", "x86")
-    }
+    // Add NDK properties if wanted, e.g.
+    // abiFilters += listOf("arm64-v8a")
 }
 externalNativeBuild {
 cmake {
@@ -152,7 +152,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

 ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
 model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
-if (newline_tmp->backend != GGML_BACKEND_CPU) {
+if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
     if (newline_tmp->buffer == NULL) {
         printf("newline_tmp tensor buffer is NULL\n");
     }
@@ -548,8 +548,8 @@ int main(int argc, char ** argv) {
 LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
     n_past, n_left, n_ctx, params.n_keep, n_discard);

 llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
-llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

 n_past -= n_discard;

@@ -576,9 +576,9 @@ int main(int argc, char ** argv) {
 LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
 LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

-llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
-llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
-llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
+llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
+llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
+llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);

 n_past -= bd;
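Note: both renamed primitives edit token positions of one sequence: llama_kv_cache_seq_add adds a delta to every position in [p0, p1), and llama_kv_cache_seq_div divides every position in [p0, p1) by an integer factor; the self-extend transformation above is built from these two edits. A toy sketch of the position arithmetic, independent of the real llama.cpp API and using made-up window values:

```cpp
#include <cstdio>
#include <vector>

// "seq_add": pos += delta for every position in [p0, p1).
static void seq_add(std::vector<int> & pos, int p0, int p1, int delta) {
    for (int & p : pos) {
        if (p >= p0 && p < p1) p += delta;
    }
}

// "seq_div": pos /= d for every position in [p0, p1).
static void seq_div(std::vector<int> & pos, int p0, int p1, int d) {
    for (int & p : pos) {
        if (p >= p0 && p < p1) p /= d;
    }
}

int main() {
    std::vector<int> pos = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};

    // Compress the window [4, 8) by a group factor of 2, then pull the tail
    // [8, 12) back by 4 so the position stream stays contiguous.
    seq_div(pos, 4, 8, 2);
    seq_add(pos, 8, 12, -4);

    for (int p : pos) std::printf("%d ", p);
    std::printf("\n"); // prints: 0 1 2 3 2 2 3 3 4 5 6 7
    return 0;
}
```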
@@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
 const int n_batch = ctx_params.n_batch;
 const int n_batch_grp = ctx_params.n_batch/n_grp;

-LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
+LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);

 // print the prompt token-by-token

@@ -146,10 +146,11 @@ int main(int argc, char ** argv) {
 const int ib = i/n_batch - 1;
 const int bd = n_batch_grp*(n_grp - 1);

-llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd);
+llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
 llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
+llama_kv_cache_update (ctx);

-n_past -= bd;
+n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
 }

 llama_batch_clear(batch);

@@ -179,10 +180,12 @@ int main(int argc, char ** argv) {

 LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);

 llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
-llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+llama_kv_cache_defrag (ctx);
+llama_kv_cache_update (ctx);

-n_past -= n_discard;
+n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;

 llama_batch_clear(batch);

@@ -208,10 +211,12 @@ int main(int argc, char ** argv) {
 if (n_discard > 0) {
     LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

     llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
-    llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+    llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+    llama_kv_cache_defrag (ctx);
+    llama_kv_cache_update (ctx);

-    n_past -= n_discard;
+    n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
 }
 }
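Note: taken together, the new call order in this example is: remove a span, shift the remainder over it, optionally defragment, apply the pending cache operations, then re-derive n_past from the cache itself instead of tracking it by hand. A condensed, commented sketch of that sequence, using only the functions that appear in the hunks above; this is a sketch rather than code from the example, and it assumes an initialized llama_context with sequence 0 populated:

```cpp
#include "llama.h"

// Sketch only: n_keep, n_discard and n_ctx are computed as in the example above.
static void shift_context(llama_context * ctx, int n_keep, int n_discard, int n_ctx, int & n_past) {
    // drop the discarded span ...
    llama_kv_cache_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
    // ... slide the remaining tokens back over it ...
    llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
    // ... compact the cells and apply the queued operations ...
    llama_kv_cache_defrag (ctx);
    llama_kv_cache_update (ctx);
    // ... and let the cache report the new end position.
    n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
}
```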
@@ -1,8 +1,20 @@
-# llama.cpp/example/server
+# LLaMA.cpp HTTP Server

-This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
+Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**.

-Command line options:
+Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
+
+**Features:**
+ * LLM inference of F16 and quantum models on GPU and CPU
+ * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
+ * Parallel decoding with multi-user support
+ * Continuous batching
+ * Multimodal (wip)
+ * Monitoring endpoints
+
+The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
+
+**Command line options:**

 - `--threads N`, `-t N`: Set the number of threads to use during generation.
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.

@@ -39,9 +51,12 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
-- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
+- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
+- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
 - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+- `--log-disable`: Output logs to stdout only, default: enabled.
+- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)

 ## Build

@@ -457,6 +472,18 @@ Notice that each `probs` is an array of length `n_probs`.
 ]
 ```

+- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
+
+  Available metrics:
+  - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
+  - `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
+  - `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
+  - `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
+  - `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
+  - `llamacpp:kv_cache_tokens`: KV-cache tokens.
+  - `llamacpp:requests_processing`: Number of request processing.
+  - `llamacpp:requests_deferred`: Number of request deferred.
+
 ## More examples

 ### Change system prompt on runtime
@@ -43,9 +43,11 @@ struct server_params
 int32_t read_timeout = 600;
 int32_t write_timeout = 600;
 bool slots_endpoint = true;
+bool metrics_endpoint = false;
 };

 bool server_verbose = false;
+bool server_log_json = true;

 static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
 {
@@ -301,12 +303,76 @@ struct llama_client_slot
 }

 void print_timings() const {
-    LOG_TEE("\n");
-    LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-        __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
-    LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-        __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
-    LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
+    char buffer[512];
+    double t_token = t_prompt_processing / num_prompt_tokens_processed;
+    double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+    sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+        t_prompt_processing, num_prompt_tokens_processed,
+        t_token, n_tokens_second);
+    LOG_INFO(buffer, {
+        {"slot_id", id},
+        {"task_id", task_id},
+        {"t_prompt_processing", t_prompt_processing},
+        {"num_prompt_tokens_processed", num_prompt_tokens_processed},
+        {"t_token", t_token},
+        {"n_tokens_second", n_tokens_second},
+    });
+
+    t_token = t_token_generation / n_decoded;
+    n_tokens_second = 1e3 / t_token_generation * n_decoded;
+    sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
+        t_token_generation, n_decoded,
+        t_token, n_tokens_second);
+    LOG_INFO(buffer, {
+        {"slot_id", id},
+        {"task_id", task_id},
+        {"t_token_generation", t_token_generation},
+        {"n_decoded", n_decoded},
+        {"t_token", t_token},
+        {"n_tokens_second", n_tokens_second},
+    });
+
+    sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
+    LOG_INFO(buffer, {
+        {"slot_id", id},
+        {"task_id", task_id},
+        {"t_prompt_processing", t_prompt_processing},
+        {"t_token_generation", t_token_generation},
+        {"t_total", t_prompt_processing + t_token_generation},
+    });
+}
+};
+
+struct llama_metrics {
+    uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t n_tokens_predicted_total = 0;
+
+    uint64_t n_prompt_tokens_processed = 0;
+    uint64_t t_prompt_processing = 0;
+
+    uint64_t n_tokens_predicted = 0;
+    uint64_t t_tokens_generation = 0;
+
+    void on_prompt_eval(const llama_client_slot &slot) {
+        n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
+        n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
+        t_prompt_processing += slot.t_prompt_processing;
+    }
+
+    void on_prediction(const llama_client_slot &slot) {
+        n_tokens_predicted_total += slot.n_decoded;
+        n_tokens_predicted += slot.n_decoded;
+        t_tokens_generation += slot.t_token_generation;
+    }
+
+    void reset_bucket() {
+        n_prompt_tokens_processed = 0;
+        t_prompt_processing = 0;
+        n_tokens_predicted = 0;
+        t_tokens_generation = 0;
 }
 };
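Note: the new llama_metrics struct keeps two kinds of counters: monotonically growing totals that back the `*_total` Prometheus counters, and bucket counters that accumulate between scrapes and are cleared by reset_bucket(), which is what lets the metrics task report average throughput since the previous scrape. A standalone sketch of that "running total + resettable bucket" pattern; the names here are illustrative, not the server's actual types:

```cpp
#include <cstdint>
#include <cstdio>

struct throughput_metrics {
    uint64_t tokens_total   = 0; // never reset: exported as a Prometheus counter
    uint64_t tokens_bucket  = 0; // cleared on every scrape
    uint64_t time_ms_bucket = 0; // cleared on every scrape

    void on_batch(uint64_t n_tokens, uint64_t t_ms) {
        tokens_total   += n_tokens;
        tokens_bucket  += n_tokens;
        time_ms_bucket += t_ms;
    }

    // Average tokens/s since the last scrape, then start a new bucket.
    double scrape() {
        const double tps = time_ms_bucket > 0 ? 1e3 * tokens_bucket / time_ms_bucket : 0.0;
        tokens_bucket  = 0;
        time_ms_bucket = 0;
        return tps;
    }
};

int main() {
    throughput_metrics m;
    m.on_batch(128, 250); // 128 tokens in 250 ms
    m.on_batch(256, 500); // 256 tokens in 500 ms
    std::printf("total=%llu tok, avg=%.1f tok/s\n",
                (unsigned long long) m.tokens_total, m.scrape()); // total=384, avg=512.0
    return 0;
}
```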
@@ -344,6 +410,8 @@ struct llama_server_context
 llama_server_queue queue_tasks;
 llama_server_response queue_results;

+llama_metrics metrics;
+
 ~llama_server_context()
 {
     if (ctx)
@@ -363,7 +431,7 @@ struct llama_server_context
 params = params_;
 if (!params.mmproj.empty()) {
     multimodal = true;
-    LOG_TEE("Multi Modal Mode Enabled");
+    LOG_INFO("Multi Modal Mode Enabled", {});
     clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
     if(clp_ctx == nullptr) {
         LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});

@@ -416,7 +484,7 @@ struct llama_server_context

 const int32_t n_ctx_slot = n_ctx / params.n_parallel;

-LOG_TEE("Available slots:\n");
+LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
 for (int i = 0; i < params.n_parallel; i++)
 {
     llama_client_slot slot;

@@ -425,7 +493,10 @@ struct llama_server_context
 slot.n_ctx = n_ctx_slot;
 slot.n_predict = params.n_predict;

-LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
+LOG_INFO("new slot", {
+    {"slot_id", slot.id},
+    {"n_ctx_slot", slot.n_ctx}
+});

 const int ga_n = params.grp_attn_n;
 const int ga_w = params.grp_attn_w;
@@ -435,7 +506,12 @@ struct llama_server_context
 GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+
+LOG_INFO("slot self-extend", {
+    {"slot_id", slot.id},
+    {"ga_n", ga_n},
+    {"ga_w", ga_w}
+});
 }

 slot.ga_i = 0;
@@ -729,10 +805,16 @@ struct llama_server_context
 img_sl.img_data = clip_image_u8_init();
 if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
 {
-    LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+    LOG_ERROR("failed to load image", {
+        {"slot_id", slot->id},
+        {"img_sl_id", img_sl.id}
+    });
     return false;
 }
-LOG_TEE("slot %i - loaded image\n", slot->id);
+LOG_VERBOSE("image loaded", {
+    {"slot_id", slot->id},
+    {"img_sl_id", img_sl.id}
+});
 img_sl.request_encode_image = true;
 slot->images.push_back(img_sl);
 }

@@ -792,7 +874,10 @@ struct llama_server_context

 all_slots_are_idle = false;

-LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
+LOG_INFO("slot is processing task", {
+    {"slot_id", slot->id},
+    {"task_id", slot->task_id},
+});

 return true;
 }
@@ -817,10 +902,24 @@ struct llama_server_context
 llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
 }

-if (llama_decode(ctx, batch) != 0)
+for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
 {
-    LOG_TEE("%s: llama_decode() failed\n", __func__);
-    return;
+    const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
+    llama_batch batch_view = {
+        n_tokens,
+        batch.token    + i,
+        nullptr,
+        batch.pos      + i,
+        batch.n_seq_id + i,
+        batch.seq_id   + i,
+        batch.logits   + i,
+        0, 0, 0, // unused
+    };
+    if (llama_decode(ctx, batch_view) != 0)
+    {
+        LOG_TEE("%s: llama_decode() failed\n", __func__);
+        return;
+    }
 }

 // assign the system KV cache to all parallel sequences
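Note: the loop above is the usual pattern for feeding an arbitrarily long system prompt through llama_decode: the accumulated batch is walked in windows of at most params.n_batch tokens, each wrapped in a llama_batch view whose pointers are offset by i. A stripped-down sketch of the same windowing arithmetic, without the llama types; the token count and batch size are made up:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Walk a token list in windows of at most n_batch elements, as the server does
// when decoding a long system prompt; the "view" here is just (offset, length).
int main() {
    const int n_batch = 32;
    std::vector<int> tokens(100, 0); // pretend these are 100 prompt tokens

    for (int i = 0; i < (int) tokens.size(); i += n_batch) {
        const int n_tokens = std::min(n_batch, (int) tokens.size() - i);
        // a real decode call would receive pointers offset by i and a count of n_tokens
        std::printf("decode window: offset=%d, n_tokens=%d\n", i, n_tokens);
    }
    return 0;
}
```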
@@ -1355,7 +1454,7 @@ struct llama_server_context
 if (slot == nullptr)
 {
     // if no slot is available, we defer this task for processing later
-    LOG_VERBOSE("no slot is available", {});
+    LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
     queue_tasks.defer(task);
     break;
 }

@@ -1404,7 +1503,7 @@ struct llama_server_context
 case TASK_TYPE_NEXT_RESPONSE: {
     // do nothing
 } break;
-case TASK_TYPE_SLOTS_DATA: {
+case TASK_TYPE_METRICS: {
     json slots_data = json::array();
     int n_idle_slots = 0;
     int n_processing_slots = 0;

@@ -1431,17 +1530,41 @@ struct llama_server_context
 }
 slots_data.push_back(slot_data);
 }
-LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
+LOG_INFO("slot data", {
+    {"task_id", task.id},
+    {"n_idle_slots", n_idle_slots},
+    {"n_processing_slots", n_processing_slots}
+});
+LOG_VERBOSE("slot data", {
+    {"task_id", task.id},
+    {"n_idle_slots", n_idle_slots},
+    {"n_processing_slots", n_processing_slots},
+    {"slots", slots_data}
+});
 task_result res;
 res.id = task.id;
 res.multitask_id = task.multitask_id;
 res.stop = true;
 res.error = false;
 res.result_json = {
     { "idle", n_idle_slots },
     { "processing", n_processing_slots },
-    { "slots", slots_data }
+    { "deferred", queue_tasks.queue_tasks_deferred.size() },
+
+    { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
+    { "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
+
+    { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
+    { "t_prompt_processing", metrics.t_prompt_processing},
+    { "n_tokens_predicted", metrics.n_tokens_predicted},
+    { "t_tokens_generation", metrics.t_tokens_generation},
+
+    { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
+    { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
+
+    { "slots", slots_data },
 };
+metrics.reset_bucket();
 queue_results.send(res);
 } break;
 }

@@ -1469,7 +1592,7 @@ struct llama_server_context
 bool update_slots() {
 if (system_need_update)
 {
-    LOG_TEE("updating system prompt\n");
+    LOG_INFO("updating system prompt", {});
     update_system_prompt();
 }

@@ -1479,12 +1602,13 @@ struct llama_server_context
 {
 if (system_prompt.empty() && clean_kv_cache)
 {
-    LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
+    LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
     kv_cache_clear();
 }
 return true;
 }

+LOG_VERBOSE("posting NEXT_RESPONSE", {});
 task_server task;
 task.type = TASK_TYPE_NEXT_RESPONSE;
 task.target_id = -1;
@@ -1498,12 +1622,22 @@ struct llama_server_context
 {
 // Shift context
 const int n_keep = slot.params.n_keep + add_bos_token;
-const int n_left = system_tokens.size() + slot.n_past - n_keep;
+const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
 const int n_discard = n_left / 2;

-LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
-llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
-llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+LOG_INFO("slot context shift", {
+    {"slot_id", slot.id},
+    {"task_id", slot.task_id},
+    {"n_keep", n_keep},
+    {"n_left", n_left},
+    {"n_discard", n_discard},
+    {"n_ctx", n_ctx},
+    {"n_past", slot.n_past},
+    {"n_system_tokens", system_tokens.size()},
+    {"n_cache_tokens", slot.cache_tokens.size()}
+});
+llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
+llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);

 for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
 {

@@ -1515,17 +1649,12 @@ struct llama_server_context
 slot.n_past -= n_discard;

 slot.truncated = true;
-
-LOG_VERBOSE("context shift", {
-    { "n_ctx", n_ctx },
-    { "n_keep", n_keep },
-    { "n_left", n_left },
-});
 }
 }
 }

 // decode any currently ongoing sequences
+LOG_VERBOSE("decoding ongoing sequences", {});
 for (auto & slot : slots)
 {
 // release the slot
@ -1535,7 +1664,15 @@ struct llama_server_context
|
||||||
slot.command = NONE;
|
slot.command = NONE;
|
||||||
slot.t_last_used = ggml_time_us();
|
slot.t_last_used = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
LOG_INFO("slot released", {
|
||||||
|
{"slot_id", slot.id},
|
||||||
|
{"task_id", slot.task_id},
|
||||||
|
{"n_ctx", n_ctx},
|
||||||
|
{"n_past", slot.n_past},
|
||||||
|
{"n_system_tokens", system_tokens.size()},
|
||||||
|
{"n_cache_tokens", slot.cache_tokens.size()},
|
||||||
|
{"truncated", slot.truncated}
|
||||||
|
});
|
||||||
queue_tasks.notify_slot_changed();
|
queue_tasks.notify_slot_changed();
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
|
@ -1662,6 +1799,14 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||||
|
|
||||||
|
// the last token of the cache is not in the KV cache until the next call to llama_decode
|
||||||
|
// (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
|
||||||
|
if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
|
||||||
|
{
|
||||||
|
slot.n_past -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
|
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
|
||||||
|
|
||||||
if (slot.ga_n != 1)
|
if (slot.ga_n != 1)
|
||||||
|
@ -1683,7 +1828,12 @@ struct llama_server_context
|
||||||
slot.ga_i = ga_i;
|
slot.ga_i = ga_i;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
LOG_INFO("slot progression", {
|
||||||
|
{ "slot_id", slot.id },
|
||||||
|
{ "task_id", slot.task_id },
|
||||||
|
{ "n_past", slot.n_past },
|
||||||
|
{ "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.cache_tokens = prompt_tokens;
|
slot.cache_tokens = prompt_tokens;
|
||||||
|
@ -1691,7 +1841,10 @@ struct llama_server_context
|
||||||
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
||||||
{
|
{
|
||||||
// we have to evaluate at least 1 token to generate logits.
|
// we have to evaluate at least 1 token to generate logits.
|
||||||
LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
|
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
||||||
|
{ "slot_id", slot.id },
|
||||||
|
{ "task_id", slot.task_id }
|
||||||
|
});
|
||||||
slot.n_past--;
|
slot.n_past--;
|
||||||
if (slot.ga_i > 0)
|
if (slot.ga_i > 0)
|
||||||
{
|
{
|
||||||
|
@ -1699,9 +1852,13 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
int p0 = (int) system_tokens.size() + slot.n_past;
|
||||||
|
LOG_INFO("kv cache rm [p0, end)", {
|
||||||
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
|
{ "slot_id", slot.id },
|
||||||
|
{ "task_id", slot.task_id },
|
||||||
|
{ "p0", p0 }
|
||||||
|
});
|
||||||
|
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
||||||
|
|
||||||
LOG_VERBOSE("prompt ingested", {
|
LOG_VERBOSE("prompt ingested", {
|
||||||
{"n_past", slot.n_past},
|
{"n_past", slot.n_past},
|
||||||
|
@ -1736,7 +1893,13 @@ struct llama_server_context
|
||||||
|
|
||||||
if (has_images && !ingest_images(slot, n_batch))
|
if (has_images && !ingest_images(slot, n_batch))
|
||||||
{
|
{
|
||||||
LOG_TEE("failed processing images\n");
|
LOG_ERROR("failed processing images", {
|
||||||
|
"slot_id", slot.id,
|
||||||
|
"task_id", slot.task_id,
|
||||||
|
});
|
||||||
|
// FIXME @phymbert: to be properly tested
|
||||||
|
// early returning without changing the slot state will block the slot for ever
|
||||||
|
// no one at the moment is checking the return value
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1778,9 +1941,9 @@ struct llama_server_context
|
||||||
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||||
|
|
||||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
|
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
|
||||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||||
|
|
||||||
slot.n_past_se -= bd;
|
slot.n_past_se -= bd;
|
||||||
|
|
||||||
|
@ -1836,7 +1999,7 @@ struct llama_server_context
|
||||||
send_embedding(slot);
|
send_embedding(slot);
|
||||||
slot.release();
|
slot.release();
|
||||||
slot.i_batch = -1;
|
slot.i_batch = -1;
|
||||||
return true;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
completion_token_output result;
|
completion_token_output result;
|
||||||
|
@ -1849,6 +2012,7 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
slot.t_start_genereration = ggml_time_us();
|
slot.t_start_genereration = ggml_time_us();
|
||||||
slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
|
slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
|
||||||
|
metrics.on_prompt_eval(slot);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||||
|
@ -1871,11 +2035,14 @@ struct llama_server_context
|
||||||
slot.release();
|
slot.release();
|
||||||
slot.print_timings();
|
slot.print_timings();
|
||||||
send_final_response(slot);
|
send_final_response(slot);
|
||||||
|
metrics.on_prediction(slot);
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.i_batch = -1;
|
slot.i_batch = -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG_VERBOSE("slots updated", {});
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1953,8 +2120,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf(" -ctv TYPE, --cache-type-v TYPE\n");
    printf("    KV cache data type for V (default: f16)\n");
    printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
+   printf(" --log-format log output format: json or text (default: json)\n");
    printf(" --log-disable disables logging to a file.\n");
    printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
+   printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
    printf("\n");
    printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
    printf(" --override-kv KEY=TYPE:VALUE\n");
@ -2086,9 +2255,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            break;
        }
        std::string value(argv[i]);
-       /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
+       /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
-       else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
+       else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
-       else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+       else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
        else { invalid_param = true; break; }
    }
    else if (arg == "--rope-freq-base")
@ -2212,15 +2381,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        std::string arg_next = argv[i];
        if (arg_next == "none")
        {
-           params.split_mode = LLAMA_SPLIT_NONE;
+           params.split_mode = LLAMA_SPLIT_MODE_NONE;
        }
        else if (arg_next == "layer")
        {
-           params.split_mode = LLAMA_SPLIT_LAYER;
+           params.split_mode = LLAMA_SPLIT_MODE_LAYER;
        }
        else if (arg_next == "row")
        {
-           params.split_mode = LLAMA_SPLIT_ROW;
+           params.split_mode = LLAMA_SPLIT_MODE_ROW;
        }
        else {
            invalid_param = true;

@ -2405,6 +2574,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
        params.mmproj = argv[i];
    }
+   else if (arg == "--log-format")
+   {
+       if (++i >= argc)
+       {
+           invalid_param = true;
+           break;
+       }
+       if (std::strcmp(argv[i], "json") == 0)
+       {
+           server_log_json = true;
+       }
+       else if (std::strcmp(argv[i], "text") == 0)
+       {
+           server_log_json = false;
+       }
+       else
+       {
+           invalid_param = true;
+           break;
+       }
+   }
    else if (arg == "--log-disable")
    {
        log_set_target(stdout);
@ -2414,6 +2604,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
    {
        sparams.slots_endpoint = false;
    }
+   else if (arg == "--metrics")
+   {
+       sparams.metrics_endpoint = true;
+   }
    else if (arg == "--chat-template")
    {
        if (++i >= argc)

@ -2447,15 +2641,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        sep++;
        if (strncmp(sep, "int:", 4) == 0) {
            sep += 4;
-           kvo.tag = LLAMA_KV_OVERRIDE_INT;
+           kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
            kvo.int_value = std::atol(sep);
        } else if (strncmp(sep, "float:", 6) == 0) {
            sep += 6;
-           kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
+           kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
            kvo.float_value = std::atof(sep);
        } else if (strncmp(sep, "bool:", 5) == 0) {
            sep += 5;
-           kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
+           kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
            if (std::strcmp(sep, "true") == 0) {
                kvo.bool_value = true;
            } else if (std::strcmp(sep, "false") == 0) {

@ -2514,32 +2708,40 @@ static json format_partial_response(

static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{
-   return json{
-       {"tokens", tokens}};
+   return json {
+       {"tokens", tokens}
+   };
}

static json format_detokenized_response(std::string content)
{
-   return json{
-       {"content", content}};
+   return json {
+       {"content", content}
+   };
}

static void log_server_request(const httplib::Request &req, const httplib::Response &res)
{
+   // skip GH copilot requests when using default port
+   if (req.path == "/v1/health" || req.path == "/v1/completions")
+   {
+       return;
+   }
+
    LOG_INFO("request", {
        {"remote_addr", req.remote_addr},
        {"remote_port", req.remote_port},
        {"status", res.status},
        {"method", req.method},
        {"path", req.path},
        {"params", req.params},
    });

    LOG_VERBOSE("request", {
        {"request", req.body},
        {"response", res.body},
    });
}

struct token_translator

@ -2621,7 +2823,7 @@ int main(int argc, char **argv)
            // request slots data using task queue
            task_server task;
            task.id = llama.queue_tasks.get_new_id();
-           task.type = TASK_TYPE_SLOTS_DATA;
+           task.type = TASK_TYPE_METRICS;
            task.target_id = -1;

            llama.queue_results.add_waiting_task_id(task.id);
@ -2668,7 +2870,7 @@ int main(int argc, char **argv)
            // request slots data using task queue
            task_server task;
            task.id = llama.queue_tasks.get_new_id();
-           task.type = TASK_TYPE_SLOTS_DATA;
+           task.type = TASK_TYPE_METRICS;
            task.target_id = -1;

            llama.queue_results.add_waiting_task_id(task.id);

@ -2683,6 +2885,87 @@ int main(int argc, char **argv)
        });
    }

+   if (sparams.metrics_endpoint) {
+       svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) {
+           // request slots data using task queue
+           task_server task;
+           task.id = llama.queue_tasks.get_new_id();
+           task.type = TASK_TYPE_METRICS;
+           task.target_id = -1;
+
+           llama.queue_results.add_waiting_task_id(task.id);
+           llama.queue_tasks.post(task);
+
+           // get the result
+           task_result result = llama.queue_results.recv(task.id);
+           llama.queue_results.remove_waiting_task_id(task.id);
+
+           json data = result.result_json;
+
+           uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
+           uint64_t t_prompt_processing       = data["t_prompt_processing"];
+
+           uint64_t n_tokens_predicted  = data["n_tokens_predicted"];
+           uint64_t t_tokens_generation = data["t_tokens_generation"];
+
+           int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+
+           // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
+           json all_metrics_def = json {
+               {"counter", {{
+                   {"name",  "prompt_tokens_total"},
+                   {"help",  "Number of prompt tokens processed."},
+                   {"value", data["n_prompt_tokens_processed_total"]}
+               }, {
+                   {"name",  "tokens_predicted_total"},
+                   {"help",  "Number of generation tokens processed."},
+                   {"value", data["n_tokens_predicted_total"]}
+               }}},
+               {"gauge", {{
+                   {"name",  "prompt_tokens_seconds"},
+                   {"help",  "Average prompt throughput in tokens/s."},
+                   {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
+               },{
+                   {"name",  "predicted_tokens_seconds"},
+                   {"help",  "Average generation throughput in tokens/s."},
+                   {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
+               },{
+                   {"name",  "kv_cache_usage_ratio"},
+                   {"help",  "KV-cache usage. 1 means 100 percent usage."},
+                   {"value", 1. * kv_cache_used_cells / params.n_ctx}
+               },{
+                   {"name",  "kv_cache_tokens"},
+                   {"help",  "KV-cache tokens."},
+                   {"value", data["kv_cache_tokens_count"]}
+               },{
+                   {"name",  "requests_processing"},
+                   {"help",  "Number of request processing."},
+                   {"value", data["processing"]}
+               },{
+                   {"name",  "requests_deferred"},
+                   {"help",  "Number of request deferred."},
+                   {"value", data["deferred"]}
+               }}}
+           };
+
+           std::stringstream prometheus;
+           for (const auto& el : all_metrics_def.items()) {
+               const auto& type = el.key();
+               const auto& metrics_def = el.value();
+               for (const auto& metric_def : metrics_def) {
+                   std::string name = metric_def["name"];
+                   std::string help = metric_def["help"];
+                   prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
+                              << "# TYPE llamacpp:" << name << " " << type << "\n"
+                              << "llamacpp:" << name << " " << metric_def["value"] << "\n";
+               }
+           }
+
+           res.set_content(prometheus.str(), "text/plain; version=0.0.4");
+           res.status = 200; // HTTP OK
+       });
+   }
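
As a quick sanity check of the handler above, the endpoint can be scraped with the same `prometheus_client` parser that the test suite below pulls in. This is only an illustrative sketch, not part of the change: the host/port and the use of `requests` are assumptions.

```python
# Illustrative sketch, not part of the patch: scrape /metrics from a server started with
# `--metrics` (host/port assumed) and parse the Prometheus text exposition format.
import requests
from prometheus_client import parser

response = requests.get("http://localhost:8080/metrics")
assert response.status_code == 200
assert response.headers["Content-Type"].startswith("text/plain")

# Families are emitted as "# HELP llamacpp:<name> ..." / "# TYPE ..." / "llamacpp:<name> <value>".
for family in parser.text_string_to_metric_families(response.text):
    for sample in family.samples:
        print(sample.name, sample.value)
```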

    svr.set_logger(log_server_request);

    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)

@ -2735,9 +3018,6 @@ int main(int argc, char **argv)
    // Set the base directory for serving static files
    svr.set_base_dir(sparams.public_path);

-   // to make it ctrl+clickable:
-   LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
-
    std::unordered_map<std::string, std::string> log_data;
    log_data["hostname"] = sparams.hostname;
    log_data["port"] = std::to_string(sparams.port);

@ -32,6 +32,7 @@ It's possible to override some scenario steps values with environment variables:
- `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
- `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
- `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
+- `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format

### Run @bug, @wip or @wrong_usage annotated scenario

@ -16,6 +16,8 @@ def before_scenario(context, scenario):


def after_scenario(context, scenario):
+   if context.server_process is None:
+       return
    if scenario.status == "failed":
        if 'GITHUB_ACTIONS' in os.environ:
            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")

@ -1,36 +1,4 @@
# List of ongoing issues
@bug
Feature: Issues
-  # Issue #5655
+  # No confirmed issue at the moment
-  Scenario: Multi users embeddings
-    Given a server listening on localhost:8080
-    And a model file stories260K.gguf
-    And a model alias tinyllama-2
-    And 42 as server seed
-    And 64 KV cache size
-    And 2 slots
-    And continuous batching
-    And embeddings extraction
-    Then the server is starting
-    Then the server is healthy
-
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated

@ -8,6 +8,7 @@ Feature: Parallel
    And 42 as server seed
    And 64 KV cache size
    And 2 slots
+   And embeddings extraction
    And continuous batching
    Then the server is starting
    Then the server is healthy
@ -97,3 +98,48 @@ Feature: Parallel
    Then the server is busy
    Then the server is idle
    Then all prompts are predicted
+
+  Scenario: Multi users embeddings
+    Given a prompt:
+      """
+      Write a very long story about AI.
+      """
+    And a prompt:
+      """
+      Write another very long music lyrics.
+      """
+    And a prompt:
+      """
+      Write a very long poem.
+      """
+    And a prompt:
+      """
+      Write a very long joke.
+      """
+    Given concurrent embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are generated
+
+  Scenario: Multi users OAI compatibility embeddings
+    Given a prompt:
+      """
+      In which country Paris is located ?
+      """
+    And a prompt:
+      """
+      Is Madrid the capital of Spain ?
+      """
+    And a prompt:
+      """
+      What is the biggest US city ?
+      """
+    And a prompt:
+      """
+      What is the capital of Bulgaria ?
+      """
+    And a model tinyllama-2
+    Given concurrent OAI embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are generated
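
The OAI-compatibility scenario above boils down to a single embeddings request with several inputs. A minimal client-side sketch of what it exercises is shown below; the host, port and the use of `requests` are assumptions, the behave steps drive this through the test harness instead.

```python
# Illustrative sketch of a multi-input request against the OAI-compatible /v1/embeddings route.
import requests

prompts = [
    "In which country Paris is located ?",
    "Is Madrid the capital of Spain ?",
    "What is the biggest US city ?",
    "What is the capital of Bulgaria ?",
]

resp = requests.post(
    "http://localhost:8080/v1/embeddings",
    json={"input": prompts, "model": "tinyllama-2"},
)
resp.raise_for_status()
data = resp.json()

# one embedding vector is expected per input prompt
assert data["object"] == "list"
assert len(data["data"]) == len(prompts)
```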
@ -13,6 +13,7 @@ Feature: llama.cpp server
    And 1 slots
    And embeddings extraction
    And 32 server max tokens to predict
+   And prometheus compatible metrics exposed
    Then the server is starting
    Then the server is healthy
@ -25,11 +26,12 @@ Feature: llama.cpp server
    And <n_predict> max tokens to predict
    And a completion request with no api error
    Then <n_predicted> tokens are predicted matching <re_content>
+   And prometheus metrics are exposed

    Examples: Prompts
      | prompt                           | n_predict | re_content                             | n_predicted |
-     | I believe the meaning of life is | 8         | read                                   | 8           |
+     | I believe the meaning of life is | 8         | (read<or>going)+                       | 8           |
-     | Write a joke about AI            | 64        | (park<or>friends<or>scared)+           | 32          |
+     | Write a joke about AI            | 64        | (park<or>friends<or>scared<or>always)+ | 32          |

  Scenario Outline: OAI Compatibility
    Given a model <model>
@ -60,6 +62,19 @@ Feature: llama.cpp server
      """
    Then embeddings are generated

+  Scenario: OAI Embeddings compatibility with multiple inputs
+    Given a model tinyllama-2
+    Given a prompt:
+      """
+      In which country Paris is located ?
+      """
+    And a prompt:
+      """
+      Is Madrid the capital of Spain ?
+      """
+    When an OAI compatible embeddings computation request for multiple inputs
+    Then embeddings are generated
+

  Scenario: Tokenize / Detokenize
    When tokenizing:

@ -1,4 +1,5 @@
import asyncio
+import collections
import json
import os
import re
@ -12,6 +13,7 @@ import aiohttp
import openai
from behave import step
from behave.api.async_step import async_run_until_complete
+from prometheus_client import parser


@step(u"a server listening on {server_fqdn}:{server_port}")
@ -33,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port):
    context.server_api_key = None
    context.server_continuous_batching = False
    context.server_embeddings = False
+   context.server_metrics = False
+   context.server_process = None
    context.server_seed = None
    context.user_api_key = None

@ -81,6 +85,11 @@ def step_server_embeddings(context):
    context.server_embeddings = True


+@step(u'prometheus compatible metrics exposed')
+def step_server_metrics(context):
+   context.server_metrics = True
+
+
@step(u"the server is starting")
def step_start_server(context):
    start_server_background(context)
@ -262,57 +271,56 @@ def step_a_prompt_prompt(context, prompt):
@step(u'concurrent completion requests')
@async_run_until_complete()
async def step_concurrent_completion_requests(context):
-   await concurrent_completion_requests(context,
+   await concurrent_requests(context,
                              request_completion,
                              # prompt is inserted automatically
                              context.base_url,
                              debug=context.debug,
                              n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
                              server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
                              user_api_key=context.user_api_key if hasattr(context,
                                                                           'user_api_key') else None)


@step(u'concurrent OAI completions requests')
@async_run_until_complete
async def step_oai_chat_completions(context):
-   await concurrent_completion_requests(context, oai_chat_completions,
+   await concurrent_requests(context, oai_chat_completions,
                              # user_prompt is inserted automatically
                              context.system_prompt,
                              context.base_url,
-                             '/v1/chat/completions',
                              True,  # async_client
                              model=context.model
                              if hasattr(context, 'model') else None,
                              n_predict=context.n_predict
                              if hasattr(context, 'n_predict') else None,
                              enable_streaming=context.enable_streaming
                              if hasattr(context, 'enable_streaming') else None,
                              server_seed=context.server_seed
                              if hasattr(context, 'server_seed') else None,
                              user_api_key=context.user_api_key
                              if hasattr(context, 'user_api_key') else None)


@step(u'concurrent OAI completions requests no v1')
@async_run_until_complete
async def step_oai_chat_completions(context):
-   await concurrent_completion_requests(context, oai_chat_completions,
+   await concurrent_requests(context, oai_chat_completions,
                              # user_prompt is inserted automatically
                              context.system_prompt,
                              context.base_url,
                              '/chat/completions',
                              True,  # async_client
                              model=context.model
                              if hasattr(context, 'model') else None,
                              n_predict=context.n_predict
                              if hasattr(context, 'n_predict') else None,
                              enable_streaming=context.enable_streaming
                              if hasattr(context, 'enable_streaming') else None,
                              server_seed=context.server_seed
                              if hasattr(context, 'server_seed') else None,
                              user_api_key=context.user_api_key
                              if hasattr(context, 'user_api_key') else None)


@step(u'all prompts are predicted')
@ -339,36 +347,58 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
@step(u'embeddings are computed for')
@async_run_until_complete
async def step_compute_embedding(context):
-   content = context.text
-   base_url = context.base_url
-   context.embeddings = await request_embedding(content, base_url)
+   context.embeddings = await request_embedding(context.text, base_url=context.base_url)


@step(u'embeddings are generated')
def step_assert_embeddings(context):
-   assert_embeddings(context.embeddings)
+   if len(context.prompts) == 0:
+       assert_embeddings(context.embeddings)
+   else:
+       assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
+                                                                f"context.prompts={context.prompts}\n"
+                                                                f"context.embeddings={context.embeddings}")
+       for embedding in context.embeddings:
+           context.prompts.pop()
+           assert_embeddings(embedding)


@step(u'an OAI compatible embeddings computation request for')
-def step_oai_compute_embedding(context):
-   openai.api_key = 'nope'  # openai client always expects an api_keu
-   if context.user_api_key is not None:
-       openai.api_key = context.user_api_key
-   openai.api_base = f'{context.base_url}/v1'
-   embeddings = openai.Embedding.create(
-       model=context.model,
-       input=context.text,
-   )
-   context.embeddings = embeddings
+@async_run_until_complete
+async def step_oai_compute_embeddings(context):
+   context.embeddings = await request_oai_embeddings(context.text,
+                                                     base_url=context.base_url,
+                                                     user_api_key=context.user_api_key,
+                                                     model=context.model)
+
+
+@step(u'an OAI compatible embeddings computation request for multiple inputs')
+@async_run_until_complete
+async def step_oai_compute_embeddings_multiple_inputs(context):
+   context.embeddings = await request_oai_embeddings(context.prompts,
+                                                     base_url=context.base_url,
+                                                     user_api_key=context.user_api_key,
+                                                     model=context.model)


@step(u'concurrent embedding requests')
@async_run_until_complete()
async def step_concurrent_embedding_requests(context):
-   await concurrent_completion_requests(context,
+   await concurrent_requests(context,
                              request_embedding,
                              # prompt is inserted automatically
-                             context.base_url)
+                             base_url=context.base_url)
+
+
+@step(u'concurrent OAI embedding requests')
+@async_run_until_complete()
+async def step_concurrent_oai_embedding_requests(context):
+   await concurrent_requests(context,
+                             request_oai_embeddings,
+                             # prompt is inserted automatically
+                             base_url=context.base_url,
+                             async_client=True,
+                             model=context.model)


@step(u'all embeddings are generated')
@ -424,7 +454,24 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
    assert context.options_response.headers[cors_header] == cors_header_value


-async def concurrent_completion_requests(context, f_completion, *args, **kwargs):
+@step(u'prometheus metrics are exposed')
+@async_run_until_complete
+async def step_prometheus_metrics_exported(context):
+   async with aiohttp.ClientSession() as session:
+       async with await session.get(f'{context.base_url}/metrics') as metrics_response:
+           assert metrics_response.status == 200
+           assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
+           metrics_raw = await metrics_response.text()
+           metric_exported = False
+           for metric in parser.text_string_to_metric_families(metrics_raw):
+               match metric.name:
+                   case "llamacpp:kv_cache_usage_ratio":
+                       assert len(metric.samples) > 0
+                       metric_exported = True
+           assert metric_exported, "No metrics exported"
+
+
+async def concurrent_requests(context, f_completion, *args, **kwargs):
    n_prompts = len(context.prompts)
    if context.debug:
        print(f"starting {n_prompts} concurrent completion requests...")
@ -589,7 +636,7 @@ async def oai_chat_completions(user_prompt,
    return completion_response


-async def request_embedding(content, base_url):
+async def request_embedding(content, base_url=None):
    async with aiohttp.ClientSession() as session:
        async with session.post(f'{base_url}/embedding',
                                json={
@ -600,6 +647,46 @@ async def request_embedding(content, base_url):
    return response_json['embedding']


+async def request_oai_embeddings(input,
+                                 base_url=None, user_api_key=None,
+                                 model=None, async_client=False):
+   # openai client always expects an api_key
+   user_api_key = user_api_key if user_api_key is not None else 'nope'
+   if async_client:
+       origin = 'llama.cpp'
+       if user_api_key is not None:
+           headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
+       async with aiohttp.ClientSession() as session:
+           async with session.post(f'{base_url}/v1/embeddings',
+                                   json={
+                                       "input": input,
+                                       "model": model,
+                                   },
+                                   headers=headers) as response:
+               assert response.status == 200, f"received status code not expected: {response.status}"
+               assert response.headers['Access-Control-Allow-Origin'] == origin
+               assert response.headers['Content-Type'] == "application/json; charset=utf-8"
+               response_json = await response.json()
+               assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
+               assert response_json['object'] == 'list'
+               return response_json['data']
+   else:
+       openai.api_key = user_api_key
+       openai.api_base = f'{base_url}/v1'
+       oai_embeddings = openai.Embedding.create(
+           model=model,
+           input=input,
+       )
+
+       if isinstance(input, collections.abc.Sequence):
+           embeddings = []
+           for an_oai_embeddings in oai_embeddings.data:
+               embeddings.append(an_oai_embeddings.embedding)
+       else:
+           embeddings = oai_embeddings.data.embedding
+       return embeddings


def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
    content = completion_response['content']
    n_predicted = completion_response['timings']['predicted_n']
@ -635,6 +722,8 @@ async def wait_for_health_status(context,
    if context.debug:
        print(f"Starting checking for health for expected_health_status={expected_health_status}")
    timeout = 3  # seconds
+   if expected_health_status == 'ok':
+       timeout = 10  # CI slow inference
    interval = 0.5
    counter = 0
    async with aiohttp.ClientSession() as session:
@ -672,7 +761,7 @@ async def wait_for_health_status(context,
        if n_completions > 0:
            return

-   assert False, 'timeout exceeded'
+   assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'


def assert_embeddings(embeddings):
@ -714,6 +803,8 @@ def start_server_background(context):
        server_args.append('--cont-batching')
    if context.server_embeddings:
        server_args.append('--embedding')
+   if context.server_metrics:
+       server_args.append('--metrics')
    if context.model_alias is not None:
        server_args.extend(['--alias', context.model_alias])
    if context.n_ctx is not None:
@ -726,6 +817,8 @@ def start_server_background(context):
        server_args.extend(['--api-key', context.server_api_key])
    if context.debug:
        server_args.append('--verbose')
+   if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
+       server_args.extend(['--log-format', "text"])
    print(f"starting server with: {context.server_path}", *server_args)
    context.server_process = subprocess.Popen(
        [str(arg) for arg in [context.server_path, *server_args]],

@ -1,3 +1,4 @@
aiohttp~=3.9.3
behave~=1.2.6
openai~=0.25.0
+prometheus-client~=0.20.0

@ -14,6 +14,7 @@
using json = nlohmann::json;

extern bool server_verbose;
+extern bool server_log_json;

#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
@ -27,14 +28,14 @@ extern bool server_verbose;
    { \
        if (server_verbose) \
        { \
-           server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+           server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
        } \
    } while (0)
#endif

-#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

//
// parallel
@ -50,7 +51,7 @@ enum task_type {
    TASK_TYPE_COMPLETION,
    TASK_TYPE_CANCEL,
    TASK_TYPE_NEXT_RESPONSE,
-   TASK_TYPE_SLOTS_DATA
+   TASK_TYPE_METRICS
};

struct task_server {
@ -133,26 +134,48 @@ struct completion_token_output
    std::string text_to_send;
};

-static inline void server_log(const char *level, const char *function, int line,
-                              const char *message, const nlohmann::ordered_json &extra)
+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
{
-   nlohmann::ordered_json log
-   {
+   std::stringstream ss_tid;
+   ss_tid << std::this_thread::get_id();
+   json log = nlohmann::ordered_json{
+       {"tid", ss_tid.str()},
        {"timestamp", time(nullptr)},
-       {"level", level},
-       {"function", function},
-       {"line", line},
-       {"message", message},
    };

-   if (!extra.empty())
-   {
-       log.merge_patch(extra);
-   }
+   if (server_log_json) {
+       log.merge_patch(
+           {
+               {"level", level},
+               {"function", function},
+               {"line", line},
+               {"msg", message},
+           });
+       if (!extra.empty()) {
+           log.merge_patch(extra);
+       }

-   const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
-   printf("%.*s\n", (int)str.size(), str.data());
-   fflush(stdout);
+       std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
+   } else {
+       char buf[1024];
+       snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
+
+       if (!extra.empty()) {
+           log.merge_patch(extra);
+       }
+       std::stringstream ss;
+       ss << buf << " |";
+       for (const auto& el : log.items())
+       {
+           const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
+           snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str());
+           ss << buf;
+       }
+
+       const std::string str = ss.str();
+       printf("%.*s\n", (int)str.size(), str.data());
+       fflush(stdout);
+   }
}
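
For reference, output produced with `--log-format json` can be consumed line by line with any JSON reader. The sketch below is illustrative only: the sample line is invented, but its field names ("tid", "timestamp", "level", "function", "line", "msg") follow the `server_log()` body above.

```python
# Minimal sketch: parse one line of the server's JSON log format.
# The sample line is made up for illustration; field names mirror server_log() above.
import json

sample = '{"tid":"140222","timestamp":1708655239,"level":"INFO","function":"main","line":2925,"msg":"model loaded"}'

record = json.loads(sample)
print(record["level"], record["msg"])
```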

//
@ -234,6 +257,7 @@ struct llama_server_queue {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        if (task.id == -1) {
            task.id = id++;
+           LOG_VERBOSE("new task id", {{"new_id", task.id}});
        }
        queue_tasks.push_back(std::move(task));
        condition_tasks.notify_one();
@ -249,7 +273,9 @@ struct llama_server_queue {
    // Get the next id for creating anew task
    int get_new_id() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
-       return id++;
+       int new_id = id++;
+       LOG_VERBOSE("new task id", {{"new_id", new_id}});
+       return new_id;
    }

    // Register function to process a new task
@ -290,8 +316,7 @@ struct llama_server_queue {
    void start_loop() {
        running = true;
        while (true) {
-           // new task arrived
-           LOG_VERBOSE("have new task", {});
+           LOG_VERBOSE("new task may arrive", {});
            {
                while (true)
                {
@ -303,7 +328,7 @@ struct llama_server_queue {
                    task_server task = queue_tasks.front();
                    queue_tasks.erase(queue_tasks.begin());
                    lock.unlock();
-                   LOG_VERBOSE("callback_new_task", {});
+                   LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
                    callback_new_task(task);
                }
                LOG_VERBOSE("callback_all_task_finished", {});
@ -384,11 +409,13 @@ struct llama_server_response {
    std::condition_variable condition_results;

    void add_waiting_task_id(int task_id) {
+       LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.insert(task_id);
    }

    void remove_waiting_task_id(int task_id) {
+       LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(task_id);
    }
@ -401,7 +428,6 @@ struct llama_server_response {
            condition_results.wait(lock, [&]{
                return !queue_results.empty();
            });
-           LOG_VERBOSE("condition_results unblock", {});

            for (int i = 0; i < (int) queue_results.size(); i++)
            {
@ -426,22 +452,22 @@ struct llama_server_response {
    // Send a new result to a waiting task_id
    void send(task_result result) {
        std::unique_lock<std::mutex> lock(mutex_results);
-       LOG_VERBOSE("send new result", {});
+       LOG_VERBOSE("send new result", {{"task_id", result.id}});
        for (auto& task_id : waiting_task_ids) {
            // LOG_TEE("waiting task id %i \n", task_id);
            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
            if (result.multitask_id == task_id)
            {
-               LOG_VERBOSE("callback_update_multitask", {});
+               LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
                callback_update_multitask(task_id, result.id, result);
                continue;
            }

            if (result.id == task_id)
            {
-               LOG_VERBOSE("queue_results.push_back", {});
+               LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
                queue_results.push_back(result);
-               condition_results.notify_one();
+               condition_results.notify_all();
                return;
            }
        }
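
As a general note on the `notify_one()` to `notify_all()` switch above: when several consumers wait on one results queue and each filters by its own task id, waking a single waiter can pick one whose id does not match while the right one keeps sleeping. The Python sketch below illustrates that general pattern only; it is not code from this patch.

```python
# Illustrative sketch (not from the patch) of the send/recv waiter pattern:
# every waiter is woken and re-checks its own predicate, so no result is missed.
import threading

cond = threading.Condition()
results = []  # list of (task_id, payload)

def send(task_id, payload):
    with cond:
        results.append((task_id, payload))
        cond.notify_all()  # wake every waiter; each re-checks for its own task_id

def recv(task_id):
    with cond:
        while True:
            for i, (tid, payload) in enumerate(results):
                if tid == task_id:
                    results.pop(i)
                    return payload
            cond.wait()
```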
@ -960,7 +960,7 @@ int main(int argc, char ** argv) {
    struct ggml_opt_context * opt = train->opt;

    // set opt params from command line
-   opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+   opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
    opt->params.print_forward_graph = false;
    opt->params.print_backward_graph = false;
    opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;

6
flake.lock
generated
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-       "lastModified": 1708118438,
+       "lastModified": 1708655239,
-       "narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=",
+       "narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-       "rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80",
+       "rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a",
        "type": "github"
      },
      "original": {

138
ggml-cuda.cu
@ -6369,11 +6369,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
    int ixj = col ^ j;
    if (ixj > col) {
        if ((col & k) == 0) {
-           if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+           if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
                swap(dst_row[col], dst_row[ixj]);
            }
        } else {
-           if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+           if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
                swap(dst_row[col], dst_row[ixj]);
            }
        }
    }
@ -7927,10 +7927,10 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co

    const dim3 block_dims(ncols, 1, 1);
    const dim3 block_nums(1, nrows, 1);
-   if (order == GGML_SORT_ASC) {
+   if (order == GGML_SORT_ORDER_ASC) {
-       k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+       k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-   } else if (order == GGML_SORT_DESC) {
+   } else if (order == GGML_SORT_ORDER_DESC) {
-       k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+       k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
    } else {
        GGML_ASSERT(false);
    }
@ -8362,11 +8362,11 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(

    cudaMemcpyKind kind;
    char * src_ptr;
-   if (src->backend == GGML_BACKEND_CPU) {
+   if (src->backend == GGML_BACKEND_TYPE_CPU) {
        kind = cudaMemcpyHostToDevice;
        src_ptr = (char *) src->data;
-   } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+   } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-       GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+       GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
        kind = cudaMemcpyDeviceToDevice;
        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
        int id;
@ -8771,7 +8771,7 @@ static void ggml_cuda_op_mul_mat_q(

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
-   const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+   const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
@ -8920,7 +8920,7 @@ static void ggml_cuda_op_mul_mat_vec_q(

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
-   const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+   const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
@ -9096,7 +9096,7 @@ static void ggml_cuda_op_mul_mat_cublas(

    // the main device has a larger memory buffer to hold the results from all GPUs
    // ldc == nrows of the matrix that cuBLAS writes into
-   int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+   int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;

    const int compute_capability = g_device_caps[id].cc;

@ -9444,7 +9444,7 @@ static void ggml_cuda_op_soft_max(
    const bool use_src2 = src2 != nullptr;

    if (use_src2) {
-       const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
+       const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;

        if (src2_on_device) {
            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
@ -9502,16 +9502,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
    const bool use_src1 = src1 != nullptr;
    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

-   GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+   GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-   GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
+   GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);

    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-   const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+   const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-   const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
+   const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
-   const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
+   const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;

    // dd = data device
    float * src0_ddf = nullptr;
@ -9555,7 +9555,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
    }

-   if (dst->backend == GGML_BACKEND_CPU) {
+   if (dst->backend == GGML_BACKEND_TYPE_CPU) {
        CUDA_CHECK(cudaDeviceSynchronize());
    }
}
@ -9636,8 +9636,8 @@ static void ggml_cuda_op_mul_mat(
    const int nb2 = dst->nb[2];
    const int nb3 = dst->nb[3];

-   GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+   GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-   GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+   GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@ -9653,20 +9653,20 @@ static void ggml_cuda_op_mul_mat(
    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-   const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+   const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
    const bool src0_is_contiguous = ggml_is_contiguous(src0);
    const bool src1_is_contiguous = ggml_is_contiguous(src1);

    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

-   const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+   const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
    GGML_ASSERT(!(split && ne02 > 1));
    GGML_ASSERT(!(split && ne03 > 1));
    GGML_ASSERT(!(split && ne02 < ne12));

    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
    if (split) {
-       // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
+       // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
        // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
        tensor_split = buft_ctx->tensor_split;
@ -9724,8 +9724,8 @@ static void ggml_cuda_op_mul_mat(

        used_devices++;

-       const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+       const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-       const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+       const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;

        ggml_cuda_set_device(id);
        cudaStream_t stream = g_cudaStreams[id][0];
@ -9776,8 +9776,8 @@ static void ggml_cuda_op_mul_mat(
            continue;
        }

-       const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+       const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-       const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+       const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
        const int64_t row_diff = dev[id].row_high - dev[id].row_low;

        ggml_cuda_set_device(id);
@ -9802,12 +9802,12 @@ static void ggml_cuda_op_mul_mat(

            // the main device memory buffer can be on VRAM scratch, with space for all partial results
            // in that case an offset on dst_ddf_i is needed
-           if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+           if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
                dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
            }

            // copy src0, src1 to device if necessary
-           if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+           if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
                if (id != g_main_device) {
                    if (convert_src1_to_q8_1) {
                        char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
@ -9820,14 +9820,14 @@ static void ggml_cuda_op_mul_mat(
                            src1_ncols*ne10*sizeof(float), stream));
                    }
                }
-           } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
+           } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
                CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
                    src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
||||||
} else {
|
} else {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
|
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
|
||||||
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
||||||
CUDA_CHECK(cudaGetLastError());
|
CUDA_CHECK(cudaGetLastError());
|
||||||
}
|
}
|
||||||
|
@ -9845,10 +9845,10 @@ static void ggml_cuda_op_mul_mat(
|
||||||
if (!dst_on_device) {
|
if (!dst_on_device) {
|
||||||
void * dst_off_device;
|
void * dst_off_device;
|
||||||
cudaMemcpyKind kind;
|
cudaMemcpyKind kind;
|
||||||
if (dst->backend == GGML_BACKEND_CPU) {
|
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||||
dst_off_device = dst->data;
|
dst_off_device = dst->data;
|
||||||
kind = cudaMemcpyDeviceToHost;
|
kind = cudaMemcpyDeviceToHost;
|
||||||
} else if (dst->backend == GGML_BACKEND_GPU) {
|
} else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
|
||||||
dst_off_device = dst_extra->data_device[g_main_device];
|
dst_off_device = dst_extra->data_device[g_main_device];
|
||||||
kind = cudaMemcpyDeviceToDevice;
|
kind = cudaMemcpyDeviceToDevice;
|
||||||
} else {
|
} else {
|
||||||
|
@ -9913,7 +9913,7 @@ static void ggml_cuda_op_mul_mat(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_CPU) {
|
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||||
ggml_cuda_set_device(g_main_device);
|
ggml_cuda_set_device(g_main_device);
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
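The hunk above keeps the existing copy-back logic and only renames the enum values: when dst lives on the CPU the partial results are copied back to host memory, and when it lives on a GPU they are gathered into the main device's buffer. A minimal hedged sketch of that decision, assuming ggml.h is included so the renamed GGML_BACKEND_TYPE_* constants are visible; pick_copy_kind is a hypothetical helper, not part of this change:

#include <cuda_runtime.h>

// Hedged sketch of the copy-direction choice above (not the file's actual helper).
static cudaMemcpyKind pick_copy_kind(int backend_type /* value of tensor->backend */) {
    if (backend_type == GGML_BACKEND_TYPE_CPU) {
        return cudaMemcpyDeviceToHost;   // partial results go back to host memory
    }
    if (backend_type == GGML_BACKEND_TYPE_GPU) {
        return cudaMemcpyDeviceToDevice; // partial results are gathered on the main device
    }
    return cudaMemcpyDefault;            // split tensors are never a mul_mat destination here
}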
@@ -10019,7 +10019,7 @@ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const stru

static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
GGML_ASSERT(src0->type == GGML_TYPE_F16);

@@ -10050,7 +10050,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -10109,7 +10109,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));

-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16);

GGML_TENSOR_BINARY_OP_LOCALS

@@ -10255,11 +10255,11 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm

static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
-        (src1->backend == GGML_BACKEND_GPU) &&
+        (src1->backend == GGML_BACKEND_TYPE_GPU) &&
-        ( dst->backend == GGML_BACKEND_GPU);
+        ( dst->backend == GGML_BACKEND_TYPE_GPU);

-    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;

int64_t min_compute_capability = INT_MAX;

@@ -10409,7 +10409,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
GGML_ASSERT(!ggml_is_transposed(src00));
GGML_ASSERT(!ggml_is_transposed(src1));

-    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);

@@ -10553,7 +10553,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

cudaStream_t stream = g_cudaStreams[g_main_device][0];

-    if (ids->backend == GGML_BACKEND_GPU) {
+    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));

@@ -10570,20 +10570,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;

-    src1_row.backend = GGML_BACKEND_GPU;
+    src1_row.backend = GGML_BACKEND_TYPE_GPU;
-    dst_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend = GGML_BACKEND_TYPE_GPU;

src1_row.extra = &src1_row_extra;
dst_row.extra = &dst_row_extra;

-    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
(char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-    char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+    char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
(char *) dst->data : (char *) dst_extra->data_device[g_main_device];

if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-        GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
//int32_t row_id;

@@ -10611,9 +10611,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
dst_row_extra.data_device[g_main_device] = dst_contiguous.get();

-        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
-        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ?
cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;

for (int32_t row_id = 0; row_id < n_as; ++row_id) {

@@ -10668,7 +10668,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
}
}

-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
CUDA_CHECK(cudaStreamSynchronize(stream));
}
}

@@ -10685,8 +10685,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));

-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);

GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

@@ -10817,9 +10817,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
if (!g_cublas_loaded) return false;

ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);

if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
return false;

@@ -10966,14 +10966,14 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
return false;
}

-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
}

if (params->ith != 0) {
return true;
}
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return true;
}
func(tensor->src[0], tensor->src[1], tensor);

@@ -11072,7 +11072,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t

extra->data_device[ctx->device] = tensor->data;

-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
tensor->extra = extra;

if (ggml_is_quantized(tensor->type)) {

@@ -11087,7 +11087,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
}

GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

@@ -11098,7 +11098,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
}

GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

@@ -11333,7 +11333,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
}
}
-    tensor->backend = GGML_BACKEND_GPU_SPLIT;
+    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
tensor->extra = extra;
}

@@ -11605,7 +11605,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
}

@@ -11614,7 +11614,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
}

@@ -11644,7 +11644,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
ggml_cuda_set_main_device(cuda_ctx->device);

ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
+    params.type = GGML_TASK_TYPE_COMPUTE;
params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];

@@ -11654,13 +11654,13 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
}

#ifndef NDEBUG
-        assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
+        assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
assert(node->extra != nullptr);

for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
+                assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
assert(node->src[j]->extra != nullptr);
}
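The rename touched every backend file in the same mechanical way (GGML_BACKEND_* to GGML_BACKEND_TYPE_*, GGML_TASK_* to GGML_TASK_TYPE_*, GGML_SORT_* to GGML_SORT_ORDER_*). Out-of-tree code built against the old names needs the same treatment. A purely hypothetical migration shim, not part of this diff and only a sketch of one way to bridge a transition period:

// Hypothetical compatibility shim (assumption, not in this change): lets code that
// still uses the old spellings compile against the renamed enum values.
// Remove once every call site has moved to the GGML_BACKEND_TYPE_* names.
#define GGML_BACKEND_CPU        GGML_BACKEND_TYPE_CPU
#define GGML_BACKEND_GPU        GGML_BACKEND_TYPE_GPU
#define GGML_BACKEND_GPU_SPLIT  GGML_BACKEND_TYPE_GPU_SPLIT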
@@ -2262,8 +2262,8 @@ static bool ggml_metal_graph_compute(
id<MTLComputePipelineState> pipeline = nil;

switch (order) {
-    case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
+    case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
-    case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
+    case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
default: GGML_ASSERT(false);
};
@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
}

void ggml_cl_free_data(const struct ggml_tensor* tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU) {
+    if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
return;
}

@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
}

static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];

@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
}

static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];

@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t y_size;
size_t d_size;
cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
d_X = (cl_mem) src0->extra;
} else {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
}
-    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

size_t x_offset = 0;

@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
// TODO: copy src0 here when r3>1
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
-            if (src0->backend == GGML_BACKEND_GPU) {
+            if (src0->backend == GGML_BACKEND_TYPE_GPU) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else {
// copy src0 to device

@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
// copy src1 to device
-                if (src1->backend == GGML_BACKEND_CPU) {
+                if (src1->backend == GGML_BACKEND_TYPE_CPU) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
}

@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
}

// copy dst to host
-                if (dst->backend == GGML_BACKEND_CPU) {
+                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
}

@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
}
}

-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
ggml_cl_pool_free(d_X, x_size);
}
-    if (src1->backend != GGML_BACKEND_GPU) {
+    if (src1->backend != GGML_BACKEND_TYPE_GPU) {
ggml_cl_pool_free(d_Y, y_size);
}
-    if (dst->backend != GGML_BACKEND_GPU) {
+    if (dst->backend != GGML_BACKEND_TYPE_GPU) {
ggml_cl_pool_free(d_D, d_size);
}
}

@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t y_size;
size_t d_size;
cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
d_X = (cl_mem) src0->extra;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);

@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
// TODO: copy src0 here when r3>1
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
-            if (src0->backend == GGML_BACKEND_GPU) {
+            if (src0->backend == GGML_BACKEND_TYPE_GPU) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else {
// copy src0 to device

@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
}

// copy dst to host, then convert to float
-                if (dst->backend == GGML_BACKEND_CPU) {
+                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_fp16_to_fp32_row(tmp, d, d_ne);

@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
}
}

-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
ggml_cl_pool_free(d_X, x_size);
}
ggml_cl_pool_free(d_Y, y_size);

@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
cl_mem d_Q;
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
}

@@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
+            if (src0->backend == GGML_BACKEND_TYPE_CPU) {
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-            } else if (src0->backend == GGML_BACKEND_GPU) {
+            } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
d_Q = (cl_mem) src0->extra;
} else {
GGML_ASSERT(false);

@@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
if (!mul_mat_vec) {
// convert src0 to fp32 on device
const size_t global = x_ne / global_denom;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

@@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

// compute
const size_t global = ne01 * local;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
const cl_int ncols = ne00;
events.emplace_back();
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));

@@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
}
ggml_cl_pool_free(d_Y, y_size);
ggml_cl_pool_free(d_D, d_size);
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
ggml_cl_pool_free(d_Q, q_size);
}
}

@@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
return true;
}

@@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
CL_CHECK(clFinish(queue));

tensor->extra = dst;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
}

// ggml-backend

@@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
ctx->sub_buffers.push_back(sub_buffer);
tensor->extra = sub_buffer;
}
-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
}

static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
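In the f16 path above, the device result is read back as ggml_fp16_t and each row is widened to float on the host via ggml_fp16_to_fp32_row. As a hedged illustration of what that readback step does (a generic IEEE-754 half decode, not ggml's actual implementation, which may use lookup tables or F16C), here is a small self-contained sketch:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Portable half (binary16) -> float decode; handles zero, subnormal, normal, inf/NaN.
static float fp16_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    const uint32_t exp  = (h >> 10) & 0x1F;
    const uint32_t frac = h & 0x03FF;
    uint32_t bits;
    if (exp == 0) {
        if (frac == 0) {
            bits = sign;                                    // signed zero
        } else {
            int e = -1;                                     // renormalize a subnormal
            uint32_t f = frac;
            do { e++; f <<= 1; } while ((f & 0x0400) == 0);
            bits = sign | ((uint32_t)(127 - 15 - e) << 23) | ((f & 0x03FF) << 13);
        }
    } else if (exp == 0x1F) {
        bits = sign | 0x7F800000u | (frac << 13);           // inf / NaN (payload kept)
    } else {
        bits = sign | ((exp + (127 - 15)) << 23) | (frac << 13);
    }
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}

int main() {
    const uint16_t row_f16[4] = { 0x3C00, 0xC000, 0x0000, 0x7C00 }; // 1.0, -2.0, 0.0, +inf
    float row_f32[4];
    for (int i = 0; i < 4; ++i) row_f32[i] = fp16_to_fp32(row_f16[i]);
    std::printf("%g %g %g %g\n", row_f32[0], row_f32[1], row_f32[2], row_f32[3]);
    return 0;
}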
@@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
return res;
}

+ // NOTE: not tested
+ inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+     int8x16_t res;
+
+     res[ 0] = a[b[ 0]];
+     res[ 1] = a[b[ 1]];
+     res[ 2] = a[b[ 2]];
+     res[ 3] = a[b[ 3]];
+     res[ 4] = a[b[ 4]];
+     res[ 5] = a[b[ 5]];
+     res[ 6] = a[b[ 6]];
+     res[ 7] = a[b[ 7]];
+     res[ 8] = a[b[ 8]];
+     res[ 9] = a[b[ 9]];
+     res[10] = a[b[10]];
+     res[11] = a[b[11]];
+     res[12] = a[b[12]];
+     res[13] = a[b[13]];
+     res[14] = a[b[14]];
+     res[15] = a[b[15]];
+
+     return res;
+ }
+
#else

@@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
#define ggml_int16x8x2_t int16x8x2_t

#define ggml_vld1q_s8_x2 vld1q_s8_x2
#define ggml_vld1q_s8_x4 vld1q_s8_x4
#define ggml_vqtbl1q_s8 vqtbl1q_s8
+ #define ggml_vqtbl1q_u8 vqtbl1q_u8

#endif

@@ -9488,8 +9513,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
qs += 16;

vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
-            vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
vs.val[0] = vceqq_u8(vs.val[0], mask2);
vs.val[1] = vceqq_u8(vs.val[1], mask2);

@@ -9497,8 +9522,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));

vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
-            vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
vs.val[0] = vceqq_u8(vs.val[0], mask2);
vs.val[1] = vceqq_u8(vs.val[1], mask2);
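The new ggml_vqtbl1q_u8 fallback emulates the AArch64 vqtbl1q_u8 table lookup element by element, and the iq3_s dot product above now routes through it so the same code builds on targets without the intrinsic. A small hedged, NEON-free sketch of the lookup semantics it is standing in for; note that the hardware instruction returns 0 for indices of 16 or more, while the fallback indexes directly, so this sketch (like the kernel, as far as the shown masks suggest) assumes in-range indices:

#include <cassert>
#include <cstdint>

// Scalar reference for a 16-byte table lookup: out[i] = table[idx[i]], idx[i] < 16.
static void tbl1q_u8_ref(const uint8_t table[16], const uint8_t idx[16], uint8_t out[16]) {
    for (int i = 0; i < 16; ++i) {
        out[i] = table[idx[i]];
    }
}

int main() {
    uint8_t table[16], idx[16], out[16];
    for (int i = 0; i < 16; ++i) {
        table[i] = (uint8_t) (0xA0 + i);
        idx[i]   = (uint8_t) (15 - i);   // reverse the table
    }
    tbl1q_u8_ref(table, idx, out);
    for (int i = 0; i < 16; ++i) {
        assert(out[i] == table[15 - i]);
    }
    return 0;
}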
152 ggml-sycl.cpp
@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){

size_t total_elements = ggml_nelements(src);

-    const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
float *src_data =NULL;
if(src_on_device) {
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;

@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
int ixj = col ^ j;
if (ixj > col) {
if ((col & k) == 0) {
-            if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+            if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
swap(dst_row[col], dst_row[ixj]);
}
} else {
-            if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+            if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
swap(dst_row[col], dst_row[ixj]);
}
}

@@ -10825,7 +10825,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,

const sycl::range<3> block_dims(1, 1, ncols);
const sycl::range<3> block_nums(1, nrows, 1);
-    if (order == GGML_SORT_ASC) {
+    if (order == GGML_SORT_ORDER_ASC) {
/*
DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query

@@ -10834,9 +10834,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
+                k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
});
-    } else if (order == GGML_SORT_DESC) {
+    } else if (order == GGML_SORT_ORDER_DESC) {
/*
DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query

@@ -10845,7 +10845,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
+                k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
});
} else {
GGML_ASSERT(false);

@@ -11407,12 +11407,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,

dpct::memcpy_direction kind;
char * src_ptr;
-    if (src->backend == GGML_BACKEND_CPU) {
+    if (src->backend == GGML_BACKEND_TYPE_CPU) {
kind = dpct::host_to_device;
src_ptr = (char *) src->data;
-        // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
+        // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
-    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
kind = dpct::device_to_device;
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
int id;
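k_argsort_f32_i32 above is a bitonic sort over row indices, and the GGML_SORT_ORDER_ASC/DESC rename only changes which comparison is used; the (col & k) test decides whether the current block is being built ascending or descending. A small hedged host-side sketch of that compare-exchange step (stand-in enum and helper names, not the kernel itself):

#include <cstdint>
#include <utility>

enum sort_order { SORT_ORDER_ASC, SORT_ORDER_DESC }; // stand-ins for GGML_SORT_ORDER_*

// One bitonic compare-exchange: dst holds row indices into x; swap when the pair is
// out of order for the block's direction, with DESC simply flipping the comparison.
static void compare_exchange(const float * x, int32_t * dst, int col, int ixj,
                             bool ascending_block, sort_order order) {
    const bool asc = (order == SORT_ORDER_ASC);
    const bool out_of_order = ascending_block
        ? (asc ? x[dst[col]] > x[dst[ixj]] : x[dst[col]] < x[dst[ixj]])
        : (asc ? x[dst[col]] < x[dst[ixj]] : x[dst[col]] > x[dst[ixj]]);
    if (out_of_order) {
        std::swap(dst[col], dst[ixj]);
    }
}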
@@ -11846,7 +11846,7 @@ inline void ggml_sycl_op_mul_mat_q(

// the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;

switch (src0->type) {
case GGML_TYPE_Q4_0:

@@ -12119,7 +12119,7 @@ inline void ggml_sycl_op_mul_mat_sycl(

// the main device has a larger memory buffer to hold the results from all GPUs
// ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
+    int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;

#ifdef GGML_SYCL_F16
bool use_fp16 = true; // TODO(Yu) SYCL capability check
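Both matmul paths above size the output stride by whether the current device is the main device: the main device's buffer holds all ne0 rows of dst (partial results from every GPU), while the other devices only hold their row_diff slice. A hedged sketch of that leading-dimension choice, assuming ggml.h for the renamed constant; dst_leading_dim is a hypothetical helper:

#include <cstdint>

// Sketch of the nrows_dst / ldc selection above: write with the full-dst stride only
// on the main device, otherwise with the stride of this device's row slice.
static int64_t dst_leading_dim(int backend_type, int device_id, int main_device,
                               int64_t ne0, int64_t row_diff) {
    const bool full_dst_on_this_device =
        backend_type == GGML_BACKEND_TYPE_GPU && device_id == main_device;
    return full_dst_on_this_device ? ne0 : row_diff;
}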
@@ -12501,16 +12501,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
const bool use_src1 = src1 != nullptr;
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);

ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
-    const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
+    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;

// dd = data device
float * src0_ddf = nullptr;

@@ -12565,7 +12565,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
}

-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_current_device().queues_wait_and_throw()));
}

@@ -12640,8 +12640,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];

-    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);

GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);

@@ -12656,13 +12656,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
const bool src0_is_contiguous = ggml_is_contiguous(src0);
const bool src1_is_contiguous = ggml_is_contiguous(src1);

int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

-    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
GGML_ASSERT(!(split && ne02 > 1));
GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12));

@@ -12717,8 +12717,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,

used_devices++;

-        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
+        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;

ggml_sycl_set_device(get_device_id_by_index(id));
const dpct::queue_ptr stream = g_syclStreams[id][0];

@@ -12782,8 +12782,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
continue;
}

-        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
+        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
const int64_t row_diff = row_high[id] - row_low[id];

ggml_sycl_set_device(get_device_id_by_index(id));

@@ -12809,12 +12809,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,

// the main device memory buffer can be on VRAM scratch, with space for all partial results
// in that case an offset on dst_ddf_i is needed
-                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) {
+                if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
dst_dd_i += row_low[id]; // offset is 0 if no tensor split
}

// copy src0, src1 to device if necessary
-                if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+                if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
if (id != g_main_device_index) {
if (convert_src1_to_q8_1) {
char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;

@@ -12830,14 +12830,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
src1_ncols * ne10 * sizeof(float))));
}
}
-                } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
+                } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
GGML_ASSERT(false);
}

-                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
+                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
/*
DPCT1010:92: SYCL uses exceptions to report errors and does

@@ -12867,10 +12867,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
if (!dst_on_device) {
void * dst_off_device;
dpct::memcpy_direction kind;
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
dst_off_device = dst->data;
kind = dpct::device_to_host;
-                    } else if (dst->backend == GGML_BACKEND_GPU) {
+                    } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
dst_off_device = dst_extra->data_device[g_main_device_index];
kind = dpct::device_to_device;
} else {

@@ -12954,7 +12954,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
}
}

-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_current_device().queues_wait_and_throw()));

@@ -13091,7 +13091,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
const ggml_tensor *src1,
ggml_tensor *dst) try {
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
GGML_ASSERT(src0->type == GGML_TYPE_F16);

@@ -13129,7 +13129,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -13196,7 +13196,7 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));

-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -13372,11 +13372,11 @@ catch (sycl::exception const &exc) {

static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
-        (src1->backend == GGML_BACKEND_GPU) &&
|
(src1->backend == GGML_BACKEND_TYPE_GPU) &&
|
||||||
( dst->backend == GGML_BACKEND_GPU);
|
( dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
|
|
||||||
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||||
|
|
||||||
int64_t min_compute_capability = INT_MAX;
|
int64_t min_compute_capability = INT_MAX;
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
|
@ -13505,7 +13505,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
|
||||||
GGML_ASSERT(!ggml_is_transposed(src00));
|
GGML_ASSERT(!ggml_is_transposed(src00));
|
||||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||||
|
|
||||||
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
|
GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
|
||||||
|
@ -13643,7 +13643,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||||
|
|
||||||
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
||||||
|
|
||||||
if (ids->backend == GGML_BACKEND_GPU) {
|
if (ids->backend == GGML_BACKEND_TYPE_GPU) {
|
||||||
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
|
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||||
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
||||||
|
@ -13661,20 +13661,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||||
ggml_tensor src1_row = *src1;
|
ggml_tensor src1_row = *src1;
|
||||||
ggml_tensor dst_row = *dst;
|
ggml_tensor dst_row = *dst;
|
||||||
|
|
||||||
src1_row.backend = GGML_BACKEND_GPU;
|
src1_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||||
dst_row.backend = GGML_BACKEND_GPU;
|
dst_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||||
|
|
||||||
src1_row.extra = &src1_row_extra;
|
src1_row.extra = &src1_row_extra;
|
||||||
dst_row.extra = &dst_row_extra;
|
dst_row.extra = &dst_row_extra;
|
||||||
|
|
||||||
char * src1_original = src1->backend == GGML_BACKEND_CPU ?
|
char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
|
||||||
(char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
|
(char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
|
||||||
char * dst_original = dst->backend == GGML_BACKEND_CPU ?
|
char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
|
||||||
(char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
|
(char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
|
||||||
|
|
||||||
if (src1->ne[1] == 1) {
|
if (src1->ne[1] == 1) {
|
||||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
|
|
||||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||||
//int32_t row_id;
|
//int32_t row_id;
|
||||||
|
@ -13756,7 +13756,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_CPU) {
|
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
|
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13779,8 +13779,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||||
const int64_t ne = ggml_nelements(src0);
|
const int64_t ne = ggml_nelements(src0);
|
||||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||||
|
|
||||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
|
|
||||||
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
||||||
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
||||||
|
@ -13887,17 +13887,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
||||||
memset(extra, 0, sizeof(*extra));
|
memset(extra, 0, sizeof(*extra));
|
||||||
|
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
if (backend == GGML_BACKEND_GPU && id != g_main_device_index) {
|
if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_sycl_set_device(get_device_id_by_index(id));
|
ggml_sycl_set_device(get_device_id_by_index(id));
|
||||||
const dpct::queue_ptr stream = g_syclStreams[id][0];
|
const dpct::queue_ptr stream = g_syclStreams[id][0];
|
||||||
|
|
||||||
int64_t row_low, row_high;
|
int64_t row_low, row_high;
|
||||||
if (backend == GGML_BACKEND_GPU) {
|
if (backend == GGML_BACKEND_TYPE_GPU) {
|
||||||
row_low = 0;
|
row_low = 0;
|
||||||
row_high = nrows;
|
row_high = nrows;
|
||||||
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
} else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||||
const int64_t rounding = get_row_rounding(tensor->type);
|
const int64_t rounding = get_row_rounding(tensor->type);
|
||||||
|
|
||||||
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
||||||
|
@ -13946,7 +13946,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
||||||
|
|
||||||
extra->data_device[id] = buf;
|
extra->data_device[id] = buf;
|
||||||
|
|
||||||
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
|
SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
|
||||||
new sycl::event()));
|
new sycl::event()));
|
||||||
|
@ -13963,7 +13963,7 @@ catch (sycl::exception const &exc) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
|
void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
|
||||||
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14016,15 +14016,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
tensor->backend = GGML_BACKEND_GPU;
|
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||||
|
|
||||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
|
||||||
const ggml_op src0_op = tensor->src[0]->op;
|
const ggml_op src0_op = tensor->src[0]->op;
|
||||||
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
||||||
ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
|
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
|
||||||
ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14042,7 +14042,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
|
||||||
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
||||||
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
||||||
|
|
||||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
|
||||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
||||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
|
@ -14111,7 +14111,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
|
||||||
|
|
||||||
const bool inplace = tensor->view_src != nullptr;
|
const bool inplace = tensor->view_src != nullptr;
|
||||||
|
|
||||||
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
|
if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
|
||||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
||||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
||||||
size_t view_offset = 0;
|
size_t view_offset = 0;
|
||||||
|
@ -14132,7 +14132,7 @@ catch (sycl::exception const &exc) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
|
void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||||
|
@ -14219,9 +14219,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||||
if (!g_sycl_loaded) return false;
|
if (!g_sycl_loaded) return false;
|
||||||
|
|
||||||
ggml_sycl_func_t func;
|
ggml_sycl_func_t func;
|
||||||
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|
||||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
|
|
||||||
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -14359,14 +14359,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
|
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||||
ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
|
ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params->ith != 0) {
|
if (params->ith != 0) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
func(tensor->src[0], tensor->src[1], tensor);
|
func(tensor->src[0], tensor->src[1], tensor);
|
||||||
|
@ -14517,7 +14517,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||||
|
|
||||||
extra->data_device[ctx->device] = tensor->data;
|
extra->data_device[ctx->device] = tensor->data;
|
||||||
|
|
||||||
tensor->backend = GGML_BACKEND_GPU;
|
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||||
tensor->extra = extra;
|
tensor->extra = extra;
|
||||||
|
|
||||||
if (ggml_is_quantized(tensor->type)) {
|
if (ggml_is_quantized(tensor->type)) {
|
||||||
|
@ -14548,7 +14548,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||||
ggml_tensor *tensor,
|
ggml_tensor *tensor,
|
||||||
const void *data, size_t offset,
|
const void *data, size_t offset,
|
||||||
size_t size) try {
|
size_t size) try {
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
|
|
||||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||||
|
|
||||||
|
@ -14573,7 +14573,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||||
const ggml_tensor *tensor,
|
const ggml_tensor *tensor,
|
||||||
void *data, size_t offset,
|
void *data, size_t offset,
|
||||||
size_t size) try {
|
size_t size) try {
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
|
|
||||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||||
|
|
||||||
|
@ -14809,7 +14809,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||||
|
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
|
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
||||||
(char *)tensor->data + offset, data, size)));
|
(char *)tensor->data + offset, data, size)));
|
||||||
|
@ -14827,7 +14827,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||||
|
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
|
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
||||||
data, (const char *)tensor->data + offset, size)));
|
data, (const char *)tensor->data + offset, size)));
|
||||||
|
@ -14880,7 +14880,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
|
||||||
ggml_sycl_set_main_device(sycl_ctx->device);
|
ggml_sycl_set_main_device(sycl_ctx->device);
|
||||||
|
|
||||||
ggml_compute_params params = {};
|
ggml_compute_params params = {};
|
||||||
params.type = GGML_TASK_COMPUTE;
|
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||||
params.ith = 0;
|
params.ith = 0;
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_tensor * node = cgraph->nodes[i];
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
@ -14888,13 +14888,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
|
||||||
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
|
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
assert(node->backend == GGML_BACKEND_GPU);
|
assert(node->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
||||||
assert(node->extra != nullptr);
|
assert(node->extra != nullptr);
|
||||||
|
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
if (node->src[j] != nullptr) {
|
if (node->src[j] != nullptr) {
|
||||||
assert(node->src[j]->backend == GGML_BACKEND_GPU);
|
assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
|
||||||
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
||||||
assert(node->src[j]->extra != nullptr);
|
assert(node->src[j]->extra != nullptr);
|
||||||
}
|
}
|
||||||
|
|
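Note: the ggml-sycl.cpp changes above are mechanical renames of the backend and task enumerators. As a minimal sketch only (the standalone function name and framing below are assumptions; they mirror the ggml_sycl_compute_forward hunks), the dispatch gate reads like this after the rename:

    // Sketch: how a backend op gate looks with the renamed enumerators.
    static bool backend_compute_forward_sketch(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
        const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
            || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU ||
                                              tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
            || (tensor->src[1] != nullptr &&  tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);

        if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
            return false; // fall back to the CPU path
        }
        if (params->ith != 0) {
            return true;  // only thread 0 issues device work
        }
        if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
            return true;  // these passes carry no device work here
        }
        // ... look up and launch the device kernel for tensor->op ...
        return true;
    }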
102 ggml-vulkan.cpp

@@ -2320,8 +2320,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;

     const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
     const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2453,7 +2453,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     // compute
     ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT

-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) ((char *) dst->data);
         ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
@@ -2506,8 +2506,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;

     const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
     const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@@ -2630,7 +2630,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});

-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
         ggml_vk_sync_buffers(subctx);
@@ -2647,7 +2647,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -2679,7 +2679,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;

     const uint64_t x_ne = ne00 * ne01 * ne02;
     const uint64_t y_ne = ne10 * ne11 * ne12;
@@ -2721,7 +2721,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });

-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_sync_buffers(subctx);
@@ -2738,7 +2738,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -2771,7 +2771,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
         src1_uma = d_Qy != nullptr;
     }

-    const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;

     const uint64_t d_ne = ne01 * ne11 * ne12;

@@ -2814,7 +2814,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });

-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_sync_buffers(subctx);
@@ -2832,7 +2832,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
     return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU);
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
 }

 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -2880,8 +2880,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     // TODO: support for transposed / permuted tensors
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
+    GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3110,8 +3110,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         }
     }

-    const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
-    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
+    const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
+    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;

     uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
     uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
@@ -3120,7 +3120,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     vk_buffer d_D = extra->buffer_gpu.lock();

     // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) {
+    if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
         y_sz = VK_WHOLE_SIZE;
     }

@@ -3209,9 +3209,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     }
-    if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
         ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
-    } else if(dst->backend == GGML_BACKEND_CPU) {
+    } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         float * d = (float *) dst->data;
         ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
@@ -3253,7 +3253,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
     }
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         // copy dst to host
         ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
     }
@@ -3359,7 +3359,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con

 static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     // If backend is CPU, data from src0 has to be copied off the device
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
         vk_buffer d_D = extra_src0->buffer_gpu.lock();
         ggml_vk_sync_buffers(subctx);
@@ -3994,9 +3994,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-    const bool any_on_device = node->backend == GGML_BACKEND_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
+    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));

     if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
         return;
@@ -4215,9 +4215,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }

 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-    const bool any_on_device = node->backend == GGML_BACKEND_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
+    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);

     if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
         return;
@@ -4371,7 +4371,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif

-    if (node->backend == GGML_BACKEND_CPU || last_node) {
+    if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
         ggml_vk_ctx_end(ctx->compute_ctx);
         ctx->compute_ctx->exit_tensor = node;
         ctx->compute_ctx = nullptr;
@@ -4379,9 +4379,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }

 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);

     if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
         return false;
@@ -4442,7 +4442,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return true;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return true;
     }

@@ -4745,7 +4745,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
     }

-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 }

@@ -4753,7 +4753,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

@@ -4768,7 +4768,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

@@ -4999,7 +4999,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

@@ -5020,7 +5020,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

@@ -5097,7 +5097,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
     int last_node = cgraph->n_nodes - 1;

     // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-    while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
+    while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
         last_node -= 1;
     }

@@ -5106,7 +5106,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
     }

     ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
+    params.type = GGML_TASK_TYPE_COMPUTE;
     params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -5410,7 +5410,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
 static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
     void * tensor_data = tensor->data;

-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         const size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);

@@ -5436,14 +5436,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
     std::vector<const ggml_tensor *> done;
     ggml_vk_print_graph_origin(tensor, done);

-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         free(tensor_data);
     }
 }

 static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
     return;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
     if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
         return;
     }
@@ -5481,7 +5481,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }

@@ -5518,10 +5518,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_

         src0_buffer = malloc(src0_size);
         src0_clone->data = src0_buffer;
-        if (src0->backend == GGML_BACKEND_CPU) {
+        if (src0->backend == GGML_BACKEND_TYPE_CPU) {
             memcpy(src0_clone->data, src0->data, src0_size);
             memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src0->backend == GGML_BACKEND_GPU) {
+        } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             uint64_t offset = extra->offset;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
@@ -5561,10 +5561,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_

         src1_buffer = malloc(src1_size);
         src1_clone->data = src1_buffer;
-        if (src1->backend == GGML_BACKEND_CPU) {
+        if (src1->backend == GGML_BACKEND_TYPE_CPU) {
             memcpy(src1_clone->data, src1->data, src1_size);
             memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src1->backend == GGML_BACKEND_GPU) {
+        } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             uint64_t offset = extra->offset;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
@@ -5723,7 +5723,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
     if (params->ith != 0) {
         return;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
         return;
     }
     if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
@@ -5735,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_

     void * tensor_data = tensor->data;

-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);

@@ -5868,7 +5868,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
     comp_result = nullptr;
     comp_size = 0;

-    if (tensor->backend == GGML_BACKEND_GPU) {
+    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
         free(tensor_data);
     }
 }

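The same rename shows up in both graph drivers (ggml_backend_sycl_graph_compute and ggml_backend_vk_graph_compute above). A minimal sketch of that driver loop, assuming a generic compute_forward callback (the wrapper function below is hypothetical):

    // Sketch: per-graph driver loop with the renamed task type.
    static void graph_compute_sketch(struct ggml_cgraph * cgraph,
                                     bool (*compute_forward)(struct ggml_compute_params *, struct ggml_tensor *)) {
        ggml_compute_params params = {};
        params.type = GGML_TASK_TYPE_COMPUTE;
        params.ith  = 0;
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_tensor * node = cgraph->nodes[i];
            if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
                node->op == GGML_OP_VIEW    || node->op == GGML_OP_PERMUTE) {
                continue; // view-like ops carry no device work
            }
            compute_forward(&params, node);
        }
    }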
38
ggml.h
38
ggml.h
|
@ -364,9 +364,9 @@ extern "C" {
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ggml_backend_type {
|
enum ggml_backend_type {
|
||||||
GGML_BACKEND_CPU = 0,
|
GGML_BACKEND_TYPE_CPU = 0,
|
||||||
GGML_BACKEND_GPU = 10,
|
GGML_BACKEND_TYPE_GPU = 10,
|
||||||
GGML_BACKEND_GPU_SPLIT = 20,
|
GGML_BACKEND_TYPE_GPU_SPLIT = 20,
|
||||||
};
|
};
|
||||||
|
|
||||||
// model file types
|
// model file types
|
||||||
|
@ -498,9 +498,9 @@ extern "C" {
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ggml_object_type {
|
enum ggml_object_type {
|
||||||
GGML_OBJECT_TENSOR,
|
GGML_OBJECT_TYPE_TENSOR,
|
||||||
GGML_OBJECT_GRAPH,
|
GGML_OBJECT_TYPE_GRAPH,
|
||||||
GGML_OBJECT_WORK_BUFFER
|
GGML_OBJECT_TYPE_WORK_BUFFER
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ggml_log_level {
|
enum ggml_log_level {
|
||||||
|
@ -642,9 +642,9 @@ extern "C" {
|
||||||
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
|
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
|
||||||
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
|
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
|
||||||
enum ggml_task_type {
|
enum ggml_task_type {
|
||||||
GGML_TASK_INIT = 0,
|
GGML_TASK_TYPE_INIT = 0,
|
||||||
GGML_TASK_COMPUTE,
|
GGML_TASK_TYPE_COMPUTE,
|
||||||
GGML_TASK_FINALIZE,
|
GGML_TASK_TYPE_FINALIZE,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_compute_params {
|
struct ggml_compute_params {
|
||||||
|
@ -1649,8 +1649,8 @@ extern "C" {
|
||||||
|
|
||||||
// sort rows
|
// sort rows
|
||||||
enum ggml_sort_order {
|
enum ggml_sort_order {
|
||||||
GGML_SORT_ASC,
|
GGML_SORT_ORDER_ASC,
|
||||||
GGML_SORT_DESC,
|
GGML_SORT_ORDER_DESC,
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_argsort(
|
GGML_API struct ggml_tensor * ggml_argsort(
|
||||||
|
@@ -1943,8 +1943,8 @@ extern "C" {
 
     // optimization methods
     enum ggml_opt_type {
-        GGML_OPT_ADAM,
-        GGML_OPT_LBFGS,
+        GGML_OPT_TYPE_ADAM,
+        GGML_OPT_TYPE_LBFGS,
     };
 
     // linesearch methods
@@ -1958,12 +1958,12 @@ extern "C" {
 
     // optimization return values
     enum ggml_opt_result {
-        GGML_OPT_OK = 0,
-        GGML_OPT_DID_NOT_CONVERGE,
-        GGML_OPT_NO_CONTEXT,
-        GGML_OPT_INVALID_WOLFE,
-        GGML_OPT_FAIL,
-        GGML_OPT_CANCEL,
+        GGML_OPT_RESULT_OK = 0,
+        GGML_OPT_RESULT_DID_NOT_CONVERGE,
+        GGML_OPT_RESULT_NO_CONTEXT,
+        GGML_OPT_RESULT_INVALID_WOLFE,
+        GGML_OPT_RESULT_FAIL,
+        GGML_OPT_RESULT_CANCEL,
 
         GGML_LINESEARCH_FAIL = -128,
         GGML_LINESEARCH_MINIMUM_STEP,
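Optimizer callers rename both the method and the result constants. A short sketch mirroring the tests/test-opt change at the end of this diff, with `ctx` and a scalar loss tensor `e` assumed to be set up as in that test:

    #include <stdio.h>
    #include "ggml.h"

    // Run ADAM on a scalar loss tensor and check the renamed result code.
    static void optimize(struct ggml_context * ctx, struct ggml_tensor * e) {
        struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); // was GGML_OPT_ADAM

        enum ggml_opt_result res = ggml_opt(ctx, opt_params, e);
        if (res != GGML_OPT_RESULT_OK) {                                                 // was GGML_OPT_OK
            fprintf(stderr, "optimization did not converge (result = %d)\n", (int) res);
        }
    }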
62 llama.h
@@ -64,6 +64,15 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
     };
 
+    // note: these values should be synchronized with ggml_rope
+    // TODO: maybe move this enum to ggml.h (ggml_rope_type)
+    enum llama_rope_type {
+        LLAMA_ROPE_TYPE_NONE = -1,
+        LLAMA_ROPE_TYPE_NORM = 0,
+        LLAMA_ROPE_TYPE_NEOX = 2,
+        LLAMA_ROPE_TYPE_GLM  = 4,
+    };
+
     enum llama_token_type {
         LLAMA_TOKEN_TYPE_UNDEFINED = 0,
         LLAMA_TOKEN_TYPE_NORMAL = 1,
@@ -109,23 +118,23 @@ extern "C" {
     };
 
     enum llama_rope_scaling_type {
-        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
-        LLAMA_ROPE_SCALING_NONE        = 0,
-        LLAMA_ROPE_SCALING_LINEAR      = 1,
-        LLAMA_ROPE_SCALING_YARN        = 2,
-        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+        LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
+        LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
     };
 
     enum llama_pooling_type {
-        LLAMA_POOLING_NONE = 0,
-        LLAMA_POOLING_MEAN = 1,
-        LLAMA_POOLING_CLS  = 2,
+        LLAMA_POOLING_TYPE_NONE = 0,
+        LLAMA_POOLING_TYPE_MEAN = 1,
+        LLAMA_POOLING_TYPE_CLS  = 2,
     };
 
     enum llama_split_mode {
-        LLAMA_SPLIT_NONE  = 0, // single GPU
-        LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
     };
 
     typedef struct llama_token_data {
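Applications that fill `llama_model_params` / `llama_context_params` switch to the `*_MODE_*` / `*_TYPE_*` spellings. A sketch; the `split_mode` and `rope_scaling_type` fields are assumptions taken from the llama.h of this period and are not part of this diff:

    #include "llama.h"

    // Fill the model/context parameters with the renamed constants.
    static void setup_params(struct llama_model_params * mparams, struct llama_context_params * cparams) {
        *mparams = llama_model_default_params();
        mparams->n_gpu_layers = 99;
        mparams->split_mode   = LLAMA_SPLIT_MODE_LAYER;            // was LLAMA_SPLIT_LAYER

        *cparams = llama_context_default_params();
        cparams->rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; // was LLAMA_ROPE_SCALING_YARN
    }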
@@ -173,9 +182,9 @@ extern "C" {
     } llama_batch;
 
     enum llama_model_kv_override_type {
-        LLAMA_KV_OVERRIDE_INT,
-        LLAMA_KV_OVERRIDE_FLOAT,
-        LLAMA_KV_OVERRIDE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_INT,
+        LLAMA_KV_OVERRIDE_TYPE_FLOAT,
+        LLAMA_KV_OVERRIDE_TYPE_BOOL,
     };
 
     struct llama_model_kv_override {
@@ -360,6 +369,7 @@ extern "C" {
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
+    LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
 
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
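The new accessor lets callers ask which RoPE variant a loaded model uses without inspecting GGUF metadata themselves. A sketch using the declarations above; the model path is a placeholder:

    #include <stddef.h>
    #include "llama.h"

    // Load a model and branch on its RoPE variant.
    static void check_rope_type(void) {
        struct llama_model_params mparams = llama_model_default_params();
        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
        if (model == NULL) {
            return;
        }
        if (llama_rope_type(model) == LLAMA_ROPE_TYPE_NEOX) {
            // e.g. NeoX-style rotary embeddings
        }
        llama_free_model(model);
    }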
@@ -514,10 +524,12 @@ extern "C" {
             llama_seq_id   seq_id);
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // If the KV cache is RoPEd, the KV data is updated accordingly:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_shift(
+    LLAMA_API void llama_kv_cache_seq_add(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
@@ -525,7 +537,9 @@ extern "C" {
                        llama_pos   delta);
 
     // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // If the KV cache is RoPEd, the KV data is updated accordingly:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_div(
@@ -535,6 +549,20 @@ extern "C" {
                        llama_pos   p1,
                              int   d);
 
+    // Returns the largest position present in the KV cache for the specified sequence
+    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Defragment the KV cache
+    // This will be applied:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
+    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
+
+    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
+    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+
     //
     // State / sessions
     //
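Together with the renamed llama_kv_cache_seq_add, these calls cover the typical context-shift flow. A sketch built from the declarations above; the keep count is a placeholder and the pre-existing llama_kv_cache_seq_rm is assumed from llama.h:

    #include "llama.h"

    // Drop the oldest tokens of sequence 0, shift the remainder left, then
    // defragment and apply the pending KV-cache updates.
    static void shift_context(struct llama_context * ctx) {
        const llama_seq_id seq       = 0;
        const llama_pos    n_keep    = 32;                                   // placeholder
        const llama_pos    pos_max   = llama_kv_cache_seq_pos_max(ctx, seq); // largest position in the cache
        const llama_pos    n_discard = (pos_max - n_keep) / 2;

        if (n_discard <= 0) {
            return;
        }

        llama_kv_cache_seq_rm (ctx, seq, n_keep, n_keep + n_discard);
        llama_kv_cache_seq_add(ctx, seq, n_keep + n_discard, -1, -n_discard); // was llama_kv_cache_seq_shift

        llama_kv_cache_defrag(ctx);  // queued here ...
        llama_kv_cache_update(ctx);  // ... and applied explicitly
    }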
@@ -1264,7 +1264,7 @@ struct test_argsort : public test_case {
 
     test_argsort(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {16, 10, 10, 10},
-            ggml_sort_order order = GGML_SORT_ASC)
+            ggml_sort_order order = GGML_SORT_ORDER_ASC)
         : type(type), ne(ne), order(order) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
@@ -2116,7 +2116,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
     test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
 
-    for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
+    for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
     }
@@ -118,7 +118,7 @@ int main(void) {
     const float fe = ggml_get_f32_1d(e, 0);
     printf("%s: e = %.4f\n", __func__, fe);
 
-    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
+    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
 
     ggml_opt(ctx, opt_params, e);