Merge branch 'master' into server-ui-pr
commit 734be4dcc9
35 changed files with 1631 additions and 583 deletions
.github/ISSUE_TEMPLATE/05-enhancement.yml (vendored, 2 changes)
@@ -1,4 +1,4 @@
-name: Enhancement template
+name: Enhancement
 description: Used to request enhancements for llama.cpp
 title: "Feature Request: "
 labels: ["enhancement"]

.github/ISSUE_TEMPLATE/06-question.yml (vendored, 2 changes)
@@ -1,4 +1,4 @@
-name: Question template
+name: Question
 description: Used to ask questions about llama.cpp
 title: "Question: "
 labels: ["question"]

.github/ISSUE_TEMPLATE/07-refactor.yml (vendored, new file, 28 lines)
@@ -0,0 +1,28 @@
+name: Refactor (Maintainers)
+description: Used to track refactoring opportunities
+title: "Refactor: "
+labels: ["refactor"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+
+  - type: textarea
+    id: background-description
+    attributes:
+      label: Background Description
+      description: Please provide a detailed written description of the pain points you are trying to solve.
+      placeholder: Detailed description behind your motivation to request refactor
+    validations:
+      required: true
+
+  - type: textarea
+    id: possible-approaches
+    attributes:
+      label: Possible Refactor Approaches
+      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
+      placeholder: Your idea of possible refactoring opportunity/approaches
+    validations:
+      required: false

@@ -628,6 +628,10 @@ if (LLAMA_SYCL)
         add_compile_definitions(GGML_SYCL_F16)
     endif()

+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
+    endif()
+
     add_compile_options(-I./) #include DPCT
     add_compile_options(-I/${SYCL_INCLUDE_DIR})

@@ -1,4 +1,4 @@
 {
   "version": 4,
   "configurePresets": [
     {
@@ -40,6 +40,10 @@
     { "name": "arm64-windows-msvc-debug"          , "inherits": [ "base", "arm64-windows-msvc", "debug"   ] },
     { "name": "arm64-windows-msvc-release"        , "inherits": [ "base", "arm64-windows-msvc", "release" ] },
-    { "name": "arm64-windows-msvc+static-release" , "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
+    { "name": "arm64-windows-msvc+static-release" , "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
+
+    { "name": "x64-windows-msvc-debug"            , "inherits": [ "base", "debug"   ] },
+    { "name": "x64-windows-msvc-release"          , "inherits": [ "base", "release" ] },
+    { "name": "x64-windows-msvc+static-release"   , "inherits": [ "base", "release", "static" ] }
   ]
 }

@@ -55,8 +55,8 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 ## OS

 | OS      | Status  | Verified                           |
-|---------|---------|------------------------------------|
-| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+|---------|---------|------------------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
 | Windows | Support | Windows 11                         |

@@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M                         |
+| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
 | Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

@@ -478,6 +478,7 @@ Building the program with BLAS support may lead to some performance improvements
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. |
 | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |

@@ -1317,6 +1317,17 @@ class LlamaModel(Model):
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
             self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1331,9 +1342,9 @@ class LlamaModel(Model):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")

-        if name.endswith("q_proj.weight"):
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

         # process the experts separately
@@ -2620,6 +2631,85 @@ class ArcticModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("DeepseekV2ForCausalLM")
+class DeepseekV2Model(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def write_tensors(self):
+        super().write_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 ###### CONVERSION LOGIC ######

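Note: the expert-merging step in the new DeepseekV2 converter buffers each layer's per-expert 2D weight matrices and stacks them into a single 3D tensor per projection before writing. A minimal standalone sketch of that idea, using plain PyTorch with made-up names and shapes rather than the converter's Model classes:

# Sketch of the expert-merging idea: stack n_experts [out, in] matrices into one
# [n_experts, out, in] tensor per projection. Tensor names and sizes are made up.
import torch

n_experts, d_out, d_in = 4, 8, 16
bid, w_name = 0, "gate_proj"

# stand-in for the per-expert tensors the converter buffers per layer
experts = {
    f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight": torch.randn(d_out, d_in)
    for xid in range(n_experts)
}

datas = [experts[f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"]
         for xid in range(n_experts)]
merged = torch.stack(datas, dim=0)          # shape: [n_experts, d_out, d_in]
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
print(merged_name, tuple(merged.shape))     # -> (4, 8, 16)
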
@@ -178,6 +178,7 @@ struct cmd_params {
     std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
+    std::vector<std::string> rpc_servers;
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
@@ -202,6 +203,7 @@ static const cmd_params cmd_params_defaults = {
     /* type_v        */ {GGML_TYPE_F16},
     /* n_threads     */ {cpu_get_num_math()},
     /* n_gpu_layers  */ {99},
+    /* rpc_servers   */ {""},
     /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
     /* main_gpu      */ {0},
     /* no_kv_offload */ {false},
@@ -230,6 +232,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
     printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -384,6 +387,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+        } else if (arg == "-rpc" || arg == "--rpc") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rpc_servers.push_back(argv[i]);
        } else if (arg == "-sm" || arg == "--split-mode") {
            if (++i >= argc) {
                invalid_param = true;
@@ -519,6 +528,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+    if (params.rpc_servers.empty())  { params.rpc_servers = cmd_params_defaults.rpc_servers; }
    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@@ -541,6 +551,7 @@ struct cmd_params_instance {
    ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
+    std::string rpc_servers;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
@@ -553,6 +564,9 @@ struct cmd_params_instance {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = n_gpu_layers;
+        if (!rpc_servers.empty()) {
+            mparams.rpc_servers = rpc_servers.c_str();
+        }
        mparams.split_mode = split_mode;
        mparams.main_gpu = main_gpu;
        mparams.tensor_split = tensor_split.data();
@@ -564,6 +578,7 @@ struct cmd_params_instance {
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model &&
               n_gpu_layers == other.n_gpu_layers &&
+               rpc_servers == other.rpc_servers &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu &&
               use_mmap == other.use_mmap &&
@@ -592,6 +607,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    // this ordering minimizes the number of times that each model needs to be reloaded
    for (const auto & m : params.model)
    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & rpc : params.rpc_servers)
    for (const auto & sm : params.split_mode)
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
@@ -618,6 +634,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
+                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
@@ -643,6 +660,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
+                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
@@ -668,6 +686,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
+                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
@@ -692,6 +711,7 @@ struct test {
    static const bool kompute;
    static const bool metal;
    static const bool sycl;
+    static const bool rpc;
    static const bool gpu_blas;
    static const bool blas;
    static const std::string cpu_info;
@@ -790,6 +810,9 @@ struct test {
        if (sycl) {
            return GGML_SYCL_NAME;
        }
+        if (rpc) {
+            return "RPC";
+        }
        if (gpu_blas) {
            return "GPU BLAS";
        }
@@ -803,7 +826,7 @@ struct test {
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_ubatch",
@@ -859,7 +882,7 @@ struct test {
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
@@ -894,6 +917,7 @@ const bool test::metal = !!ggml_cpu_has_metal();
 const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
 const bool test::blas     = !!ggml_cpu_has_blas();
 const bool test::sycl     = !!ggml_cpu_has_sycl();
+const bool test::rpc      = !!ggml_cpu_has_rpc();
 const std::string test::cpu_info = get_cpu_info();
 const std::string test::gpu_info = get_gpu_info();

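Note: the loop nest in get_cmd_params_instances above keeps the model (and now the RPC server list) in the outermost loops so every combination of the remaining parameters is benchmarked before the next model has to be reloaded. A rough sketch of the same ordering idea in Python, with hypothetical parameter values, not the tool's actual code:

# Sketch: enumerate benchmark configurations with the model as the slowest-varying
# key, so each model file only needs to be loaded once per group of runs.
from itertools import product

models       = ["7b.gguf", "13b.gguf"]     # hypothetical file names
n_gpu_layers = [0, 99]
rpc_servers  = ["", "192.0.2.1:50052"]     # "" means no RPC backend
n_batch      = [512, 2048]

# product() varies the rightmost iterable fastest, so 'model' changes least often
for model, ngl, rpc, nb in product(models, n_gpu_layers, rpc_servers, n_batch):
    print(model, ngl, rpc or "-", nb)
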
@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

-/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
 CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

@@ -594,7 +594,7 @@
             message = html`<${Probabilities} data=${data} />`
           } else {
             const text = isArrayMessage ?
-              data.map(msg => msg.content).join('').replace(/^\s+/, '') :
+              data.map(msg => msg.content).join('') :
               data;
             message = isCompletionMode ?
               text :
@@ -877,7 +877,11 @@

     // poor mans markdown replacement
     const Markdownish = (params) => {
-      const md = params.text
+      const chunks = params.text.split('```');
+
+      for (let i = 0; i < chunks.length; i++) {
+        if (i % 2 === 0) { // outside code block
+          chunks[i] = chunks[i]
             .replace(/&/g, '&amp;')
             .replace(/</g, '&lt;')
             .replace(/>/g, '&gt;')
@@ -889,7 +893,14 @@
             .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
             .replace(/`(.*?)`/g, '<code>$1</code>')
             .replace(/\n/gim, '<br />');
-      return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
+        } else { // inside code block
+          chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
+        }
+      }
+
+      const restoredText = chunks.join('');
+
+      return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
     };

     const ModelGenerationInfo = (params) => {
@@ -903,6 +914,7 @@
       `
     }

+
     // simple popover impl
     const Popover = (props) => {
       const isOpen = useSignal(false);
@@ -1054,4 +1066,3 @@
 </body>
-
 </html>

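Note: the web UI change above stops the "poor man's markdown" regexes from mangling fenced code: the message text is split on ``` so only the even-indexed chunks (outside fences) are HTML-escaped and markdown-replaced, while the odd-indexed chunks are wrapped verbatim in <pre><code>. A small Python sketch of the same splitting logic, illustrative only and not the page's JavaScript:

# Sketch: apply inline markup only outside ``` fences; wrap fenced chunks verbatim.
import html
import re

def markdownish(text: str) -> str:
    chunks = text.split("```")
    for i, chunk in enumerate(chunks):
        if i % 2 == 0:                      # outside a code fence
            chunk = html.escape(chunk)
            chunk = re.sub(r"\*\*(.*?)\*\*", r"<strong>\1</strong>", chunk)
            chunk = re.sub(r"`(.*?)`", r"<code>\1</code>", chunk)
            chunk = chunk.replace("\n", "<br />")
        else:                               # inside a code fence: keep as-is
            chunk = f"<pre><code>{chunk}</code></pre>"
        chunks[i] = chunk
    return "".join(chunks)

print(markdownish("use **bold** and\n```\nif (a < b) { }\n```"))
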
ggml-cuda.cu (24 changes)
@@ -119,6 +119,20 @@ int ggml_cuda_get_device() {
     return id;
 }

+static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+    ggml_cuda_set_device(device);
+#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
+    auto res = hipMallocManaged(ptr, size);
+    if (res == hipSuccess) {
+        // if error we "need" to know why...
+        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+    }
+    return res;
+#else
+    return cudaMalloc(ptr, size);
+#endif
+}
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -271,7 +285,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         size_t look_ahead_size = (size_t) (1.05 * size);
         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
-        CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
@@ -537,7 +551,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

     void * dev_ptr;
-    cudaError_t err = cudaMalloc(&dev_ptr, size);
+    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
@@ -798,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
         char * buf;
-        CUDA_CHECK(cudaMalloc(&buf, size));
+        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));

         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
@@ -1856,7 +1870,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         }
     }
 #else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
@@ -2872,7 +2886,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
+            return true;
         case GGML_OP_ROPE:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:

@@ -79,13 +79,8 @@
 #define cudaHostRegisterReadOnly hipHostRegisterReadOnly
 #define cudaHostUnregister hipHostUnregister
 #define cudaLaunchHostFunc hipLaunchHostFunc
-#ifdef GGML_HIP_UMA
-#define cudaMalloc hipMallocManaged
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
-#else
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#endif
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync

@@ -1,15 +1,69 @@
 #include "concat.cuh"

-static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
+// contiguous kernels
+static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
         return;
     }
-    // operation
+
     int offset_dst =
         nidx +
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
+
+    if (nidx < ne00) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * gridDim.y;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            (nidx - ne00) +
+            blockIdx.y * (ne0 - ne00) +
+            blockIdx.z * (ne0 - ne00) * gridDim.y;
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+
+    if (blockIdx.y < ne01) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            blockIdx.z * ne0 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx +
+            (blockIdx.y - ne01) * ne0 +
+            blockIdx.z * ne0 * (gridDim.y - ne01);
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+
     if (blockIdx.z < ne02) { // src0
         int offset_src =
             nidx +
@@ -25,25 +79,118 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
     }
 }

-static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
+static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
     int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
     dim3 gridDim(num_blocks, ne1, ne2);
-    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+    if (dim == 0) {
+        concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
+        return;
+    }
+    if (dim == 1) {
+        concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
+        return;
+    }
+    concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+}
+
+// non-contiguous kernel (slow)
+static __global__ void concat_f32_non_cont(
+        const char * src0,
+        const char * src1,
+              char * dst,
+         int64_t ne00,
+         int64_t ne01,
+         int64_t ne02,
+         int64_t ne03,
+        uint64_t nb00,
+        uint64_t nb01,
+        uint64_t nb02,
+        uint64_t nb03,
+         int64_t /*ne10*/,
+         int64_t /*ne11*/,
+         int64_t /*ne12*/,
+         int64_t /*ne13*/,
+        uint64_t nb10,
+        uint64_t nb11,
+        uint64_t nb12,
+        uint64_t nb13,
+         int64_t ne0,
+         int64_t /*ne1*/,
+         int64_t /*ne2*/,
+         int64_t /*ne3*/,
+        uint64_t nb0,
+        uint64_t nb1,
+        uint64_t nb2,
+        uint64_t nb3,
+        int32_t dim) {
+    const int64_t i3 = blockIdx.z;
+    const int64_t i2 = blockIdx.y;
+    const int64_t i1 = blockIdx.x;
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
+
+    const float * x;
+
+    for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
+        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+            x = (const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
+        } else {
+            x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
+        }
+
+        float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+        *y = *x;
+    }
 }

 void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    const int32_t dim = ((int32_t *) dst->op_params)[0];
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        const float * src0_d = (const float *)src0->data;
+        const float * src1_d = (const float *)src1->data;
+
+        float * dst_d = (float *)dst->data;
+
+        if (dim != 3) {
             for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_cuda(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), dst_d + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], stream);
+                concat_f32_cuda(
+                        src0_d + i3 * (src0->nb[3] / 4),
+                        src1_d + i3 * (src1->nb[3] / 4),
+                        dst_d  + i3 * ( dst->nb[3] / 4),
+                        src0->ne[0], src0->ne[1], src0->ne[2],
+                        dst->ne[0],  dst->ne[1],  dst->ne[2], dim, stream);
+            }
+        } else {
+            const size_t size0 = ggml_nbytes(src0);
+            const size_t size1 = ggml_nbytes(src1);
+
+            CUDA_CHECK(cudaMemcpyAsync(dst_d,           src0_d, size0, cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
+        }
+    } else {
+        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
+        concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+                (const char *)src0->data,
+                (const char *)src1->data,
+                (      char *)dst->data,
+                src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+                src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
+                dst->ne[0],  dst->ne[1],  dst->ne[2],  dst->ne[3],
+                dst->nb[0],  dst->nb[1],  dst->nb[2],  dst->nb[3], dim);
+    }
 }

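Note: the new non-contiguous concat kernel decides, per destination element, whether the index falls inside src0's extent; if not, it reads from src1 after subtracting src0's size along the concatenated dimension (the `o[dim]` offset). The same index rule in a small Python sketch with made-up shapes, shown here only to illustrate the arithmetic:

# Sketch of the concat index rule: an element of dst at (i0, i1, i2, i3) comes from
# src0 if it lies inside src0's extent, otherwise from src1 shifted by src0's size
# along the concatenated dimension.
def concat_source(idx, ne0, dim):
    """idx: dst index (i0..i3); ne0: src0 extents (ne00..ne03); dim: concat axis."""
    o = [0, 0, 0, 0]
    o[dim] = ne0[dim]
    if all(idx[d] < ne0[d] for d in range(4)):
        return ("src0", tuple(idx))
    return ("src1", tuple(idx[d] - o[d] for d in range(4)))

ne00 = (4, 3, 2, 1)                               # hypothetical src0 extents
print(concat_source((1, 1, 0, 0), ne00, dim=0))   # ('src0', (1, 1, 0, 0))
print(concat_source((5, 1, 0, 0), ne00, dim=0))   # ('src1', (1, 1, 0, 0))
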
@@ -170,6 +170,8 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -188,6 +190,8 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -202,6 +206,8 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -61,7 +61,7 @@ static __global__ void rope(
 template<typename T, bool has_pos, bool has_freq_facs>
 static __global__ void rope_neox(
     const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

@@ -85,15 +85,13 @@ static __global__ void rope_neox(
     const int i  = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;

-    float cur_rot = inv_ndims * ic - ib;
-
     const int p = has_pos ? pos[i2] : 0;
     const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;

-    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor;
+    const float theta_base = p*powf(theta_scale, col/2.0f)/freq_factor;

     float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+    rope_yarn(theta_base, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);

     const float x0 = x[i + 0];
     const float x1 = x[i + n_dims/2];
@@ -174,30 +172,29 @@ static void rope_neox_cuda(
     const dim3 block_nums(nrows, num_blocks_x, 1);

     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.0f / n_dims;

     if (pos == nullptr) {
         if (freq_factors == nullptr) {
             rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
                 x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
             );
         } else {
             rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
                 x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
             );
         }
     } else {
         if (freq_factors == nullptr) {
             rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
                 x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
             );
         } else {
             rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
                 x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
             );
         }
     }
@@ -254,6 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
     GGML_ASSERT(src0->type == dst->type);

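Note: as I read the rope_neox change above, the old inv_ndims/cur_rot bookkeeping is folded into a single per-pair angle: for pair index i, theta is the position times freq_base raised to -2i/n_dims (theta_scale to the power i), optionally divided by a per-frequency factor, and rope_yarn then applies freq_scale and the YARN correction. A hedged Python sketch of that angle computation only, not the CUDA kernel:

# Sketch: per-pair RoPE angles for the NeoX layout, following
# theta_base = p * theta_scale**i / freq_factor with theta_scale = freq_base**(-2/n_dims).
# The values and the freq_factors list are made up for illustration.
def rope_neox_thetas(p, n_dims, freq_base=10000.0, freq_factors=None):
    theta_scale = freq_base ** (-2.0 / n_dims)
    thetas = []
    for i in range(n_dims // 2):            # one angle per (x0, x1) pair
        ff = freq_factors[i] if freq_factors is not None else 1.0
        thetas.append(p * (theta_scale ** i) / ff)
    return thetas

print(["%.4f" % t for t in rope_neox_thetas(p=3, n_dims=8)])
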
@@ -1597,7 +1597,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             {
                 GGML_ASSERT(ne00 == ne10);

-                // TODO: assert that dim2 and dim3 are contiguous
                 GGML_ASSERT(ne12 % ne02 == 0);
                 GGML_ASSERT(ne13 % ne03 == 0);

@@ -990,6 +990,8 @@ static enum ggml_status ggml_metal_graph_compute(
             {
                 id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;

+                const int32_t dim = ((int32_t *) dst->op_params)[0];
+
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -1018,6 +1020,7 @@ static enum ggml_status ggml_metal_graph_compute(
                 [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
                 [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
                 [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
+                [encoder setBytes:&dim  length:sizeof(dim)  atIndex:27];

                 const int nth = MIN(1024, ne0);

@@ -1516,7 +1519,6 @@ static enum ggml_status ggml_metal_graph_compute(
             {
                 GGML_ASSERT(ne00 == ne10);

-                // TODO: assert that dim2 and dim3 are contiguous
                 GGML_ASSERT(ne12 % ne02 == 0);
                 GGML_ASSERT(ne13 % ne03 == 0);

@@ -2184,6 +2186,7 @@ static enum ggml_status ggml_metal_graph_compute(
         case GGML_OP_RMS_NORM:
             {
                 GGML_ASSERT(ne00 % 4 == 0);
+                GGML_ASSERT(ggml_is_contiguous_1(src0));

                 float eps;
                 memcpy(&eps, dst->op_params, sizeof(float));
@@ -2211,6 +2214,7 @@ static enum ggml_status ggml_metal_graph_compute(
         case GGML_OP_GROUP_NORM:
             {
                 GGML_ASSERT(ne00 % 4 == 0);
+                GGML_ASSERT(ggml_is_contiguous(src0));

                 //float eps;
                 //memcpy(&eps, dst->op_params, sizeof(float));
@@ -2244,6 +2248,8 @@ static enum ggml_status ggml_metal_graph_compute(
             } break;
         case GGML_OP_NORM:
             {
+                GGML_ASSERT(ggml_is_contiguous_1(src0));
+
                 float eps;
                 memcpy(&eps, dst->op_params, sizeof(float));

@ -1767,13 +1767,13 @@ kernel void kernel_rope(
|
||||||
|
|
||||||
const int64_t p = pos[i2];
|
const int64_t p = pos[i2];
|
||||||
|
|
||||||
const float theta_0 = (float)p;
|
const float theta_base = (float)p;
|
||||||
const float inv_ndims = -1.f/n_dims;
|
const float inv_ndims = -1.f/n_dims;
|
||||||
|
|
||||||
if (!is_neox) {
|
if (!is_neox) {
|
||||||
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
|
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
|
||||||
|
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
|
||||||
|
|
||||||
const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
|
|
||||||
float cos_theta, sin_theta;
|
float cos_theta, sin_theta;
|
||||||
rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||||
|
|
||||||
|
@ -1789,18 +1789,14 @@ kernel void kernel_rope(
|
||||||
} else {
|
} else {
|
||||||
for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
|
for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
|
||||||
if (ic < n_dims) {
|
if (ic < n_dims) {
|
||||||
const int64_t ib = 0;
|
const int64_t i0 = ic/2;
|
||||||
|
|
||||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
const float freq_factor = src2 != src0 ? src2[i0] : 1.0f;
|
||||||
const float cur_rot = inv_ndims*ic - ib;
|
|
||||||
const float freq_factor = src2 != src0 ? src2[ic/2] : 1.0f;
|
|
||||||
|
|
||||||
const float theta = theta_0 * pow(freq_base, cur_rot) / freq_factor;
|
const float theta = theta_base * pow(freq_base, inv_ndims*ic);
|
||||||
|
|
||||||
float cos_theta, sin_theta;
|
float cos_theta, sin_theta;
|
||||||
rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
rope_yarn(theta/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||||
|
|
||||||
const int64_t i0 = ib*n_dims + ic/2;
|
|
||||||
|
|
||||||
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
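For clarity, the NeoX branch above now derives the rotation angle for the pair starting at column ic directly from the token position and the per-pair exponent, and only folds the optional frequency factor in when the angle is handed to rope_yarn. A minimal host-side C++ sketch of that computation follows; pos, n_dims, freq_base and freq_factors mirror the kernel arguments, while rope_pair_angle itself is a hypothetical helper name, not part of ggml.

#include <cmath>
#include <cstdint>

// Angle fed to rope_yarn() for the rotated pair starting at column ic (ic is even).
// freq_factors may be null; then no per-frequency correction is applied.
static float rope_pair_angle(int64_t pos, int64_t ic, int n_dims,
                             float freq_base, const float * freq_factors) {
    const float theta_base  = (float) pos;
    const float inv_ndims   = -1.0f / n_dims;
    const float theta       = theta_base * std::pow(freq_base, inv_ndims * ic);
    const float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
    return theta / freq_factor;
}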
@@ -3366,31 +3362,30 @@ kernel void kernel_concat(
        constant  uint64_t & nb1,
        constant  uint64_t & nb2,
        constant  uint64_t & nb3,
+       constant   int32_t & dim,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {

-   const int64_t i03 = tgpig.z;
-   const int64_t i02 = tgpig.y;
-   const int64_t i01 = tgpig.x;
+   const int64_t i3 = tgpig.z;
+   const int64_t i2 = tgpig.y;
+   const int64_t i1 = tgpig.x;

-   const int64_t i13 = i03 % ne13;
-   const int64_t i12 = i02 % ne12;
-   const int64_t i11 = i01 % ne11;
+   int64_t o[4] = {0, 0, 0, 0};
+   o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));

-   device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
-   device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
-   device       char *  dst_ptr =  dst + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
+   device const float * x;

    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-       if (i02 < ne02) {
-           ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
-           src0_ptr += ntg.x*nb00;
+       if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+           x = (device const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
        } else {
-           ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
-           src1_ptr += ntg.x*nb10;
+           x = (device const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
        }
-       dst_ptr += ntg.x*nb0;
+       device float * y = (device float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

+       *y = *x;
    }
}

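The rewritten concat kernel indexes the output directly and decides per element whether it falls inside src0 or in the region contributed by src1, shifted back by src0's extent along the concatenation dimension. A small stand-alone C++ sketch of that index mapping (helper names are illustrative, plain arrays stand in for the Metal buffers):

#include <array>
#include <cstdint>

struct ConcatIndex { bool from_src0; std::array<int64_t,4> idx; };

// i   = output element index, ne0 = src0 extents, dim = concatenation axis.
// If the element lies outside src0, subtract src0's size along dim to get the src1 index.
static ConcatIndex concat_source(std::array<int64_t,4> i,
                                 std::array<int64_t,4> ne0,
                                 int dim) {
    std::array<int64_t,4> o = {0, 0, 0, 0};
    o[dim] = ne0[dim];
    const bool from_src0 = i[0] < ne0[0] && i[1] < ne0[1] && i[2] < ne0[2] && i[3] < ne0[3];
    if (!from_src0) {
        for (int d = 0; d < 4; ++d) i[d] -= o[d];   // only o[dim] is non-zero
    }
    return {from_src0, i};
}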
127
ggml-rpc.cpp
@@ -6,6 +6,7 @@
 #include <string>
 #include <vector>
 #include <memory>
+#include <mutex>
 #include <unordered_map>
 #include <unordered_set>
 #ifdef _WIN32
@@ -47,6 +48,7 @@ struct socket_t {
     sockfd_t fd;
     socket_t(sockfd_t fd) : fd(fd) {}
     ~socket_t() {
+        GGML_PRINT_DEBUG("[%s] closing socket %d\n", __func__, this->fd);
 #ifdef _WIN32
         closesocket(this->fd);
 #else
@@ -97,7 +99,7 @@ static ggml_guid_t ggml_backend_rpc_guid() {
 }

 struct ggml_backend_rpc_buffer_type_context {
-    std::shared_ptr<socket_t> sock;
+    std::string endpoint;
     std::string name;
     size_t alignment;
     size_t max_size;
@@ -106,8 +108,6 @@ struct ggml_backend_rpc_buffer_type_context {
 struct ggml_backend_rpc_context {
     std::string endpoint;
     std::string name;
-    std::shared_ptr<socket_t> sock;
-    ggml_backend_buffer_type_t buft;
 };

 struct ggml_backend_rpc_buffer_context {
@@ -231,14 +231,13 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
     return true;
 }

-static bool parse_endpoint(const char * endpoint, std::string & host, int & port) {
-    std::string str(endpoint);
-    size_t pos = str.find(':');
+static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
+    size_t pos = endpoint.find(':');
     if (pos == std::string::npos) {
         return false;
     }
-    host = str.substr(0, pos);
-    port = std::stoi(str.substr(pos + 1));
+    host = endpoint.substr(0, pos);
+    port = std::stoi(endpoint.substr(pos + 1));
     return true;
 }

@@ -273,6 +272,44 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm

 // RPC client-side implementation

+static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+    static std::unordered_map<std::string, std::weak_ptr<socket_t>> sockets;
+    static bool initialized = false;
+
+    auto it = sockets.find(endpoint);
+    if (it != sockets.end()) {
+        if (auto sock = it->second.lock()) {
+            return sock;
+        }
+    }
+    std::string host;
+    int port;
+    if (!parse_endpoint(endpoint, host, port)) {
+        return nullptr;
+    }
+#ifdef _WIN32
+    if (!initialized) {
+        WSADATA wsaData;
+        int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
+        if (res != 0) {
+            return nullptr;
+        }
+        initialized = true;
+    }
+#else
+    UNUSED(initialized);
+#endif
+    auto sock = socket_connect(host.c_str(), port);
+    if (sock == nullptr) {
+        return nullptr;
+    }
+    GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
+    sockets[endpoint] = sock;
+    return sock;
+}
+
 GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     return ctx->name.c_str();
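The new get_socket helper keeps one weak_ptr per endpoint in a static registry, so a live connection is shared by every buffer type and backend that uses the same endpoint, and a dropped connection is transparently re-established on the next call. A stripped-down C++ sketch of that caching pattern (Connection and connect are placeholders, not the ggml-rpc types):

#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

struct Connection { std::string endpoint; };

static std::shared_ptr<Connection> connect(const std::string & endpoint) {
    return std::make_shared<Connection>(Connection{endpoint});   // stand-in for socket_connect()
}

std::shared_ptr<Connection> get_connection(const std::string & endpoint) {
    static std::mutex mutex;
    static std::unordered_map<std::string, std::weak_ptr<Connection>> cache;
    std::lock_guard<std::mutex> lock(mutex);

    auto it = cache.find(endpoint);
    if (it != cache.end()) {
        if (auto conn = it->second.lock()) {
            return conn;                 // still alive -> reuse the shared connection
        }
    }
    auto conn = connect(endpoint);       // expired or unknown -> reconnect
    cache[endpoint] = conn;              // weak_ptr: the cache does not keep it alive
    return conn;
}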
@ -442,7 +479,8 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
||||||
std::vector<uint8_t> input(input_size, 0);
|
std::vector<uint8_t> input(input_size, 0);
|
||||||
memcpy(input.data(), &size, sizeof(size));
|
memcpy(input.data(), &size, sizeof(size));
|
||||||
std::vector<uint8_t> output;
|
std::vector<uint8_t> output;
|
||||||
bool status = send_rpc_cmd(buft_ctx->sock, ALLOC_BUFFER, input, output);
|
auto sock = get_socket(buft_ctx->endpoint);
|
||||||
|
bool status = send_rpc_cmd(sock, ALLOC_BUFFER, input, output);
|
||||||
GGML_ASSERT(status);
|
GGML_ASSERT(status);
|
||||||
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
|
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
|
||||||
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
|
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
|
||||||
|
@ -453,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
||||||
if (remote_ptr != 0) {
|
if (remote_ptr != 0) {
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||||
ggml_backend_rpc_buffer_interface,
|
ggml_backend_rpc_buffer_interface,
|
||||||
new ggml_backend_rpc_buffer_context{buft_ctx->sock, {}, remote_ptr, "RPC"},
|
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
|
||||||
remote_size);
|
remote_size);
|
||||||
return buffer;
|
return buffer;
|
||||||
} else {
|
} else {
|
||||||
|
@ -508,7 +546,7 @@ GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend
|
||||||
}
|
}
|
||||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||||
return buft_ctx->sock == rpc_ctx->sock;
|
return buft_ctx->endpoint == rpc_ctx->endpoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
||||||
|
@ -521,7 +559,6 @@ static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
||||||
/* .is_host = */ NULL,
|
/* .is_host = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
||||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||||
|
|
||||||
|
@ -530,16 +567,13 @@ GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
||||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)rpc_ctx->buft->context;
|
|
||||||
delete buft_ctx;
|
|
||||||
delete rpc_ctx->buft;
|
|
||||||
delete rpc_ctx;
|
delete rpc_ctx;
|
||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
|
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
||||||
return ctx->buft;
|
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
||||||
|
@ -590,7 +624,8 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
|
||||||
std::vector<uint8_t> input;
|
std::vector<uint8_t> input;
|
||||||
serialize_graph(cgraph, input);
|
serialize_graph(cgraph, input);
|
||||||
std::vector<uint8_t> output;
|
std::vector<uint8_t> output;
|
||||||
bool status = send_rpc_cmd(rpc_ctx->sock, GRAPH_COMPUTE, input, output);
|
auto sock = get_socket(rpc_ctx->endpoint);
|
||||||
|
bool status = send_rpc_cmd(sock, GRAPH_COMPUTE, input, output);
|
||||||
GGML_ASSERT(status);
|
GGML_ASSERT(status);
|
||||||
GGML_ASSERT(output.size() == 1);
|
GGML_ASSERT(output.size() == 1);
|
||||||
return (enum ggml_status)output[0];
|
return (enum ggml_status)output[0];
|
||||||
|
@ -624,42 +659,24 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
||||||
/* .event_synchronize = */ NULL,
|
/* .event_synchronize = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::unordered_map<std::string, ggml_backend_t> instances;
|
|
||||||
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
static std::mutex mutex;
|
||||||
return backend != nullptr ? ggml_backend_rpc_get_default_buffer_type(backend) : nullptr;
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
|
// NOTE: buffer types are allocated and never freed; this is by design
|
||||||
|
static std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_map;
|
||||||
|
auto it = buft_map.find(endpoint);
|
||||||
|
if (it != buft_map.end()) {
|
||||||
|
return it->second;
|
||||||
}
|
}
|
||||||
|
auto sock = get_socket(endpoint);
|
||||||
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
|
||||||
std::string endpoint_str(endpoint);
|
|
||||||
if (instances.find(endpoint_str) != instances.end()) {
|
|
||||||
return instances[endpoint_str];
|
|
||||||
}
|
|
||||||
#ifdef _WIN32
|
|
||||||
{
|
|
||||||
WSADATA wsaData;
|
|
||||||
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
|
||||||
if (res != 0) {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
fprintf(stderr, "Connecting to %s\n", endpoint);
|
|
||||||
std::string host;
|
|
||||||
int port;
|
|
||||||
if (!parse_endpoint(endpoint, host, port)) {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
auto sock = socket_connect(host.c_str(), port);
|
|
||||||
if (sock == nullptr) {
|
if (sock == nullptr) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
size_t alignment = get_alignment(sock);
|
size_t alignment = get_alignment(sock);
|
||||||
size_t max_size = get_max_size(sock);
|
size_t max_size = get_max_size(sock);
|
||||||
ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
|
ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
|
||||||
/* .sock = */ sock,
|
/* .endpoint = */ endpoint,
|
||||||
/* .name = */ "RPC" + std::to_string(sock->fd),
|
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
||||||
/* .alignment = */ alignment,
|
/* .alignment = */ alignment,
|
||||||
/* .max_size = */ max_size
|
/* .max_size = */ max_size
|
||||||
};
|
};
|
||||||
|
@ -668,21 +685,22 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||||
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
|
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
|
||||||
/* .context = */ buft_ctx
|
/* .context = */ buft_ctx
|
||||||
};
|
};
|
||||||
|
buft_map[endpoint] = buft;
|
||||||
|
return buft;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||||
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
||||||
/* .endpoint = */ endpoint,
|
/* .endpoint = */ endpoint,
|
||||||
/* .name = */ "RPC" + std::to_string(sock->fd),
|
/* .name = */ "RPC",
|
||||||
/* .sock = */ sock,
|
|
||||||
/* .buft = */ buft
|
|
||||||
};
|
};
|
||||||
|
|
||||||
instances[endpoint] = new ggml_backend {
|
ggml_backend_t backend = new ggml_backend {
|
||||||
/* .guid = */ ggml_backend_rpc_guid(),
|
/* .guid = */ ggml_backend_rpc_guid(),
|
||||||
/* .interface = */ ggml_backend_rpc_interface,
|
/* .interface = */ ggml_backend_rpc_interface,
|
||||||
/* .context = */ ctx
|
/* .context = */ ctx
|
||||||
};
|
};
|
||||||
|
return backend;
|
||||||
return instances[endpoint];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||||
|
@ -706,14 +724,13 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
auto sock = get_socket(endpoint);
|
||||||
if (backend == nullptr) {
|
if (sock == nullptr) {
|
||||||
*free = 0;
|
*free = 0;
|
||||||
*total = 0;
|
*total = 0;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
get_device_memory(sock, free, total);
|
||||||
get_device_memory(ctx->sock, free, total);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// RPC server-side implementation
|
// RPC server-side implementation
|
||||||
|
|
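Taken together, the client-side rework means buffer types are cached per endpoint and intentionally never freed, while backends are plain objects created on every ggml_backend_rpc_init call and released with ggml_backend_free. A hedged usage sketch, assuming the usual ggml-backend API and an illustrative endpoint string:

#include "ggml-backend.h"
#include "ggml-rpc.h"   // header names as used in the llama.cpp tree around this change

void rpc_client_example(void) {
    const char * endpoint = "192.0.2.1:50052";          // placeholder address

    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);

    ggml_backend_t backend = ggml_backend_rpc_init(endpoint);                  // new object per call
    ggml_backend_buffer_type_t buft = ggml_backend_rpc_buffer_type(endpoint);  // cached per endpoint
    (void) buft;

    // ... allocate buffers from buft, build a graph, run ggml_backend_graph_compute(backend, graph) ...

    ggml_backend_free(backend);   // the shared socket closes once nothing references it
}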
393
ggml-sycl.cpp
@@ -2944,6 +2944,57 @@ namespace dpct
     using shared_memory = detail::device_memory<T, shared, Dimension>;


+    template <typename T,
+              sycl::access::address_space addressSpace =
+                  sycl::access::address_space::global_space,
+              sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+              sycl::memory_scope memoryScope = sycl::memory_scope::device>
+    inline T atomic_fetch_add(T *addr, T operand) {
+        auto atm =
+            sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+        return atm.fetch_add(operand);
+    }
+
+    template <sycl::access::address_space addressSpace =
+                  sycl::access::address_space::global_space,
+              sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+              sycl::memory_scope memoryScope = sycl::memory_scope::device,
+              typename T1, typename T2>
+    inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
+        auto atm =
+            sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+        return atm.fetch_add(operand);
+    }
+
+    template <typename T, sycl::access::address_space addressSpace =
+                              sycl::access::address_space::global_space>
+    inline T atomic_fetch_add(T *addr, T operand,
+                              sycl::memory_order memoryOrder) {
+        switch (memoryOrder) {
+        case sycl::memory_order::relaxed:
+            return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
+                                    sycl::memory_scope::device>(addr, operand);
+        case sycl::memory_order::acq_rel:
+            return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
+                                    sycl::memory_scope::device>(addr, operand);
+        case sycl::memory_order::seq_cst:
+            return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
+                                    sycl::memory_scope::device>(addr, operand);
+        default:
+            assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                            "atomics are: sycl::memory_order::relaxed, "
+                            "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+        }
+    }
+
+    template <sycl::access::address_space addressSpace =
+                  sycl::access::address_space::global_space,
+              typename T1, typename T2>
+    inline T1 atomic_fetch_add(T1 *addr, T2 operand,
+                               sycl::memory_order memoryOrder) {
+        atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
+    }
+
 } // COPY from DPCT head files

 #define GGML_COMMON_DECL_SYCL
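These helpers are a thin wrapper over sycl::atomic_ref with relaxed ordering and device scope. A minimal stand-alone SYCL example of the same primitive (queue, data and function names are illustrative; x must point to device-accessible memory):

#include <sycl/sycl.hpp>

int count_positive(sycl::queue & q, const float * x, int n) {
    int * counter = sycl::malloc_shared<int>(1, q);   // USM, visible to host and device
    *counter = 0;
    q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
        if (x[i] > 0.0f) {
            sycl::atomic_ref<int, sycl::memory_order::relaxed,
                             sycl::memory_scope::device,
                             sycl::access::address_space::global_space> atm(*counter);
            atm.fetch_add(1);   // same operation the dpct helpers forward to
        }
    }).wait();
    int result = *counter;
    sycl::free(counter, q);
    return result;
}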
@@ -2971,20 +3022,19 @@ static int g_work_group_size = 0;
 // typedef sycl::half ggml_fp16_t;

 #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
-#define VER_4VEC 610   //todo for hardward optimize.
+#define VER_4VEC 130   //todo for hardward optimize.
 #define VER_GEN9 700   //todo for hardward optimize.
 #define VER_GEN12 1000000     //todo for hardward optimize.
 #define VER_GEN13 (VER_GEN12 + 1030)   //todo for hardward optimize.

 #define GGML_SYCL_MAX_NODES 8192 //TODO: adapt to hardwares

-//define for XMX in Intel GPU
-//TODO: currently, it's not used for XMX really.
+#if !defined(GGML_SYCL_FORCE_MMQ)
 #define SYCL_USE_XMX
+#endif

 // max batch size to use MMQ kernels when tensor cores are available
-#define XMX_MAX_BATCH_SIZE 32
+#define MMQ_MAX_BATCH_SIZE 32


 #if defined(_MSC_VER)
@@ -3060,6 +3110,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d
 bool   ggml_backend_is_sycl(ggml_backend_t backend);
 int    ggml_backend_sycl_get_device(ggml_backend_t backend);
 int    get_main_device();
+static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
 void   print_ggml_tensor(const char*name, struct ggml_tensor *src);
 void   log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt);

@@ -13512,6 +13563,10 @@ inline void ggml_sycl_op_concat(const ggml_tensor *src0,
                                 const float *src0_dd, const float *src1_dd,
                                 float *dst_dd,
                                 const dpct::queue_ptr &main_stream) {
+#pragma message("TODO: generalize concat kernel for dim != 2")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7563")
+    int dim = dst->op_params[0];
+    GGML_ASSERT(dim == 2);

     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -15128,7 +15183,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;

-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
             *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@ -15193,6 +15248,29 @@ catch (sycl::exception const &exc) {
|
||||||
std::exit(1);
|
std::exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
||||||
|
// TODO: accuracy issues in MMQ
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
||||||
|
switch (type) {
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
case GGML_TYPE_Q5_0:
|
||||||
|
case GGML_TYPE_Q5_1:
|
||||||
|
case GGML_TYPE_Q8_0:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
|
case GGML_TYPE_Q3_K:
|
||||||
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_F16:
|
||||||
|
return true;
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
const bool all_on_device =
|
const bool all_on_device =
|
||||||
@@ -15209,78 +15287,44 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }

-#ifdef SYCL_USE_XMX
-    const bool use_xmx = true;
-#else
-    const bool use_xmx = false;
-#endif
-
-    // debug helpers
-    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-
-    if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n");
-        ggml_sycl_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
-        ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
-        // KQ + KQV multi-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
-        ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F32) {
-        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
-        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
-    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        // GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n");
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_SYCL_DMMV_X == 0) {
-#ifdef GGML_SYCL_FORCE_DMMV
-            const bool use_mul_mat_vec_q = false;
-#else
-            bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
-            use_mul_mat_vec_q = use_mul_mat_vec_q ||
-                (src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
-                (src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||
-                (src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) ||
-                (src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M);
-#endif // GGML_SYCL_FORCE_DMMV
-
-            if (use_mul_mat_vec_q) {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
-            } else {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-            }
-        } else {
-            bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
-            use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
-
-            if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
-                use_mul_mat_q = false;
-            }
-
-            if (use_mul_mat_q) {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
-            } else {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
-            }
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-}
+    // check data types and tensor shapes for custom matrix multiplication kernels:
+    bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+
+    bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    // mmvq and mmq need the __dp4a instruction which is available for gen12+
+    // Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
+    use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
+#ifdef SYCL_USE_XMX
+    use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
+#endif // SYCL_USE_XMX
+
+    if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // KQ single-batch
+        ggml_sycl_mul_mat_vec_p021(src0, src1, dst);
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // KQV single-batch
+        ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch
+        ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
+    } else if (use_dequantize_mul_mat_vec) {
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
+    } else if (use_mul_mat_vec_q) {
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+    } else if (use_mul_mat_q) {
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+    } else {
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+    }
+}

 #if 0
 template<typename ... Srcs>
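The dispatch rewrite above computes the candidate-kernel predicates once and then walks a single if/else chain, instead of branching on use_xmx and compute capability inline. A condensed C++ sketch of the resulting selection order, with the real launches replaced by an enum (the struct and field names are illustrative, not the ggml-sycl API):

enum class MulMatKernel { VEC_P021, VEC_NC, BATCHED, DMMV, MMVQ, MMQ, GENERIC };

struct MulMatCase {
    bool split, f16_src0, permuted, contiguous_src0, transposed_src1, batched_src1, single_row_src1;
    bool use_dequantize_mul_mat_vec, use_mul_mat_vec_q, use_mul_mat_q;   // computed up front
};

static MulMatKernel select_kernel(const MulMatCase & c) {
    if (!c.split && c.f16_src0 && c.permuted && c.single_row_src1)        return MulMatKernel::VEC_P021; // KQ single-batch
    if (!c.split && c.f16_src0 && !c.contiguous_src0 &&
        !c.transposed_src1 && c.single_row_src1)                          return MulMatKernel::VEC_NC;   // KQV single-batch
    if (!c.split && c.f16_src0 && c.batched_src1)                         return MulMatKernel::BATCHED;  // KQ + KQV multi-batch
    if (c.use_dequantize_mul_mat_vec)                                     return MulMatKernel::DMMV;
    if (c.use_mul_mat_vec_q)                                              return MulMatKernel::MMVQ;
    if (c.use_mul_mat_q)                                                  return MulMatKernel::MMQ;
    return MulMatKernel::GENERIC;                                         // fall back to the generic path
}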
@@ -15455,19 +15499,83 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
 }
 #endif

+struct mmid_row_mapping {
+    int32_t i1;
+    int32_t i2;
+};
+
+__dpct_inline__ static void k_copy_src1_to_contiguous(
+    const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
+    int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping,
+    const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+    int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
+    const sycl::nd_item<3> &item_ct1, int &src1_row) {
+    int32_t iid1 = item_ct1.get_group(2);
+    int32_t id = item_ct1.get_group(1);
+
+    const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+
+    if (row_id_i != i02) {
+        return;
+    }
+
+    const int64_t i11 = id % ne11;
+    const int64_t i12 = iid1;
+
+    if (item_ct1.get_local_id(2) == 0) {
+        src1_row =
+            dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
+                cur_src1_row, 1);
+        row_mapping[src1_row] = {id, iid1};
+    }
+    /*
+    DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
+    float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
+
+#pragma unroll
+    for (int i = item_ct1.get_local_id(2); i < ne10;
+         i += item_ct1.get_local_range(2)) {
+        src1_row_contiguous[i] = src1_row_original[i];
+    }
+}
+
+__dpct_inline__ static void k_copy_dst_from_contiguous(
+    char *__restrict__ dst_original, const char *__restrict__ dst_contiguous,
+    const mmid_row_mapping *__restrict__ row_mapping, int64_t ne0, size_t nb1,
+    size_t nb2, const sycl::nd_item<3> &item_ct1) {
+    int32_t i = item_ct1.get_group(2);
+
+    const int32_t i1 = row_mapping[i].i1;
+    const int32_t i2 = row_mapping[i].i2;
+
+    const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
+    float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
+
+#pragma unroll
+    for (int j = item_ct1.get_local_id(2); j < ne0;
+         j += item_ct1.get_local_range(2)) {
+        dst_row_original[j] = dst_row_contiguous[j];
+    }
+}
+
 static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                                  const ggml_tensor *src1,
                                  ggml_tensor *dst) try {
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
-        "mul_mat_id does not support split buffers");
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
+
     const ggml_tensor *ids = dst->src[2];
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];

-    const size_t nb11 = src1->nb[1];
-    const size_t nb1  =  dst->nb[1];
+    const int64_t n_as = ne02;
+    const int64_t n_ids = ids->ne[0];

-    const int32_t id = ((int32_t *)dst->op_params)[0];
-    const int32_t n_as = src0->ne[2];
-
     std::vector<char> ids_host(ggml_nbytes(ids));
     const char * ids_dev = (const char *) ids->data;
@@ -15510,25 +15618,41 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

     src0_row.ne[2] = 1;
     src0_row.ne[3] = 1;
-    src0_row.nb[3] = src0->nb[2];
+    src0_row.nb[3] = nb02;

-    if (src1->ne[1] == 1) {
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            const int32_t row_id =
-                *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
-                                   id * ids->nb[0]);
-
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
+    src1_row.ne[1] = 1;
+    src1_row.ne[2] = 1;
+    src1_row.ne[3] = 1;
+    src1_row.nb[2] = nb11;
+    src1_row.nb[3] = nb11;
+
+    dst_row.ne[1] = 1;
+    dst_row.ne[2] = 1;
+    dst_row.ne[3] = 1;
+    dst_row.nb[2] = nb1;
+    dst_row.nb[3] = nb1;
+    if (ne12 == 1) {
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+            for (int64_t id = 0; id < n_ids; id++) {
+                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+                GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = iid1;
+
+                const int64_t i1 = id;
+                const int64_t i2 = i12;

             src0_row_extra.data_device[g_main_device] =
-                src0_original + row_id * src0->nb[2];
+                src0_original + i02*nb02;
             src1_row_extra.data_device[g_main_device] =
-                src1_original + i01 * src1->nb[1];
+                src1_original + + i11*nb11 + i12*nb12;
             dst_row_extra.data_device[g_main_device] =
-                dst_original + i01 * dst->nb[1];
+                dst_original + i1*nb1 + i2*nb2;

             ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
         }
+        }
     } else {
         sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
         sycl_pool_alloc<char> dst_contiguous(sizeof(float)*ggml_nelements(dst));
@@ -15536,64 +15660,98 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
         src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
         dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();

-        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
             int64_t num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
-                                   src1_original + i01 * nb11, nb11)));
-                num_src1_rows++;
-            }
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+
+                    if (row_id_i != i02) {
+                        continue;
+                    }
+
+                    num_src1_rows++;
+                }
+            }

             if (num_src1_rows == 0) {
                 continue;
             }

-            src0_row_extra.data_device[g_main_device] =
-                src0_original + row_id * src0->nb[2];
+            sycl_pool_alloc<int> dev_cur_src1_row(1);
+            sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(num_src1_rows);
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
+
+            {
+                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u));
+                sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
+                stream->submit([&](sycl::handler &cgh) {
+                    sycl::local_accessor<int, 0> src1_row_acc(cgh);
+
+                    char *__restrict src1_contiguous_get = src1_contiguous.get();
+                    int *__restrict dev_cur_src1_row_get = dev_cur_src1_row.get();
+                    mmid_row_mapping *__restrict dev_row_mapping_get = dev_row_mapping.get();
+                    size_t ids_nb_ct6 = ids->nb[1];
+                    size_t ids_nb_ct7 = ids->nb[0];
+
+                    cgh.parallel_for(
+                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_copy_src1_to_contiguous(
+                                src1_original, src1_contiguous_get,
+                                dev_cur_src1_row_get,
+                                dev_row_mapping_get, ids_dev, i02,
+                                ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
+                                item_ct1, src1_row_acc);
+                        });
+                });
+            }
+
+            src0_row_extra.data_device[g_main_device] = src0_original + i02*nb02;
+
+            GGML_ASSERT(nb11 == sizeof(float)*ne10);
+            GGML_ASSERT(nb1 == sizeof(float)*ne0);
             src1_row.ne[1] = num_src1_rows;
-            dst_row.ne[1] = num_src1_rows;

             src1_row.nb[1] = nb11;
             src1_row.nb[2] = num_src1_rows*nb11;
             src1_row.nb[3] = num_src1_rows*nb11;

+            dst_row.ne[1] = num_src1_rows;
             dst_row.nb[1] = nb1;
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;

             ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);

-            num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
-                    dst_original + i01 * nb1,
-                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
-                num_src1_rows++;
-            }
+            {
+                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u));
+                sycl::range<3> grid_dims(1, 1, num_src1_rows);
+                stream->submit([&](sycl::handler &cgh) {
+                    const char *__restrict dst_contiguous_get = dst_contiguous.get();
+                    const mmid_row_mapping *__restrict dev_row_mapping_get = dev_row_mapping.get();
+
+                    cgh.parallel_for(
+                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_copy_dst_from_contiguous(dst_original,
+                                                       dst_contiguous_get,
+                                                       dev_row_mapping_get,
+                                                       ne0, nb1, nb2, item_ct1);
+                        });
+                });
+            }
         }
     }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
-    }
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
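For the batched case, the new path gathers all src1 rows routed to a given expert into a contiguous buffer (first kernel), runs one matrix multiplication per expert, and scatters the rows of the result back through the recorded (i1, i2) mapping (second kernel). A CPU-side C++ sketch of just that bookkeeping, under the assumption that the ids tensor is a dense [n_tokens x n_ids] array of expert indices; the matmul and the actual copies are elided:

#include <cassert>
#include <cstdint>
#include <vector>

struct RowMapping { int32_t i1; int32_t i2; };   // same role as mmid_row_mapping

std::vector<std::vector<RowMapping>> build_row_mappings(
        int64_t n_as, int64_t n_ids, int64_t n_tokens,
        const int32_t * ids /* [n_tokens * n_ids], expert id per slot */) {
    std::vector<std::vector<RowMapping>> per_expert(n_as);
    for (int64_t iid1 = 0; iid1 < n_tokens; ++iid1) {
        for (int64_t id = 0; id < n_ids; ++id) {
            const int32_t expert = ids[iid1*n_ids + id];
            assert(expert >= 0 && expert < n_as);
            per_expert[expert].push_back({(int32_t) id, (int32_t) iid1});
        }
    }
    // For expert e: gather the selected src1 rows into a contiguous buffer, run one
    // matmul against expert e's weights, then write row k of the result back to dst
    // at position (per_expert[e][k].i1, per_expert[e][k].i2).
    return per_expert;
}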
@@ -16576,10 +16734,9 @@ GGML_CALL static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backe
     UNUSED(buffer);
 }

-// unused at the moment
-//static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
-//    return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
-//}
+static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
+}

 GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
ggml-vulkan.cpp
@@ -6012,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 };

 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
+    ggml_vk_instance_init();
+
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
 #endif

158
ggml.c
@@ -60,6 +60,9 @@

 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
+typedef atomic_int atomic_flag;
+
+#define ATOMIC_FLAG_INIT 0

 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
@@ -73,6 +76,12 @@ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
 static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
+static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
+    return InterlockedExchange(ptr, 1);
+}
+static void atomic_flag_clear(atomic_flag * ptr) {
+    InterlockedExchange(ptr, 0);
+}

 typedef HANDLE pthread_t;

@@ -2883,24 +2892,20 @@ struct ggml_state {

 // global state
 static struct ggml_state g_state;
-static atomic_int g_state_barrier = 0;
+static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;

 // barrier via spin lock
 inline static void ggml_critical_section_start(void) {
-    int processing = atomic_fetch_add(&g_state_barrier, 1);
-    while (processing > 0) {
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
+    while (atomic_flag_test_and_set(&g_state_critical)) {
+        // spin
+        sched_yield();
     }
 }

 // TODO: make this somehow automatically executed
 // some sort of "sentry" mechanism
 inline static void ggml_critical_section_end(void) {
-    atomic_fetch_sub(&g_state_barrier, 1);
+    atomic_flag_clear(&g_state_critical);
 }

 #if defined(__gnu_linux__)
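The critical section is now a plain test-and-set spin lock instead of the old counter-based barrier. An equivalent portable C++ sketch (ggml itself uses C11 atomics, or the InterlockedExchange shims above on Windows):

#include <atomic>
#include <thread>

static std::atomic_flag g_critical = ATOMIC_FLAG_INIT;

void critical_section_start() {
    // keep trying to set the flag; whoever sets it first owns the lock
    while (g_critical.test_and_set(std::memory_order_acquire)) {
        std::this_thread::yield();   // spin, but let other threads run
    }
}

void critical_section_end() {
    g_critical.clear(std::memory_order_release);   // release the lock
}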
@@ -3216,7 +3221,11 @@ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }

-static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous(tensor);
+}
+
+GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
@@ -3225,6 +3234,14 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }

+GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

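The numeric suffix says how many of the leading strides are allowed to be non-canonical: _0 is full contiguity, _1 tolerates a padded row stride (nb[1]), and _2 additionally leaves nb[2] unconstrained. A stand-alone C++ illustration of the three checks written over plain ne/nb arrays (sketch only; the real ggml versions also account for block-quantized types when checking nb[1]):

#include <cstddef>
#include <cstdint>

struct TensorView { int64_t ne[4]; size_t nb[4]; size_t type_size; };

bool is_contiguous_0(const TensorView & t) {   // fully contiguous
    return t.nb[0] == t.type_size &&
           t.nb[1] == t.nb[0]*t.ne[0] &&       // ggml divides by the block size for quantized types
           t.nb[2] == t.nb[1]*t.ne[1] &&
           t.nb[3] == t.nb[2]*t.ne[2];
}
bool is_contiguous_1(const TensorView & t) {   // rows may be padded: nb[1] is free
    return t.nb[0] == t.type_size &&
           t.nb[2] == t.nb[1]*t.ne[1] &&
           t.nb[3] == t.nb[2]*t.ne[2];
}
bool is_contiguous_2(const TensorView & t) {   // nb[1] and nb[2] are free
    return t.nb[0] == t.type_size &&
           t.nb[3] == t.nb[2]*t.ne[2];
}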
@ -4884,8 +4901,19 @@ struct ggml_tensor * ggml_repeat_back(
|
||||||
struct ggml_tensor * ggml_concat(
|
struct ggml_tensor * ggml_concat(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor* b) {
|
struct ggml_tensor * b,
|
||||||
GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
|
int dim) {
|
||||||
|
GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
|
||||||
|
|
||||||
|
int64_t ne[GGML_MAX_DIMS];
|
||||||
|
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
|
||||||
|
if (d == dim) {
|
||||||
|
ne[d] = a->ne[d] + b->ne[d];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
GGML_ASSERT(a->ne[d] == b->ne[d]);
|
||||||
|
ne[d] = a->ne[d];
|
||||||
|
}
|
||||||
|
|
||||||
bool is_node = false;
|
bool is_node = false;
|
||||||
|
|
||||||
|
@ -4893,7 +4921,9 @@ struct ggml_tensor * ggml_concat(
|
||||||
is_node = true;
|
is_node = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
|
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
|
||||||
|
|
||||||
|
ggml_set_op_params_i32(result, 0, dim);
|
||||||
|
|
||||||
result->op = GGML_OP_CONCAT;
|
result->op = GGML_OP_CONCAT;
|
||||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
|
@ -5013,6 +5043,7 @@ struct ggml_tensor * ggml_leaky_relu(
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||||
|
|
||||||
ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
|
ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
|
||||||
|
|
||||||
result->op = GGML_OP_LEAKY_RELU;
|
result->op = GGML_OP_LEAKY_RELU;
|
||||||
|
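With the dim argument, ggml_concat can now join two tensors along any axis rather than only dim 2, as long as every other dimension matches. A short usage sketch (assumes an initialized ggml_context; shapes are illustrative):

#include "ggml.h"

struct ggml_tensor * concat_along_dim1(struct ggml_context * ctx,
                                       struct ggml_tensor * a,    // e.g. [64, 32, 4, 1]
                                       struct ggml_tensor * b) {  // e.g. [64, 16, 4, 1]
    // every dimension except `dim` must match; here the result is [64, 48, 4, 1]
    return ggml_concat(ctx, a, b, 1);
}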
@ -6378,6 +6409,16 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_rope_xpos_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
int n_dims,
|
||||||
|
float base,
|
||||||
|
bool down) {
|
||||||
|
return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
|
||||||
|
}
|
||||||
|
|
||||||
// ggml_rope_back
|
// ggml_rope_back
|
||||||
|
|
||||||
struct ggml_tensor * ggml_rope_back(
|
struct ggml_tensor * ggml_rope_back(
|
||||||
|
@ -10967,31 +11008,34 @@ static void ggml_compute_forward_concat_f32(
|
||||||
GGML_ASSERT(nb00 == sizeof(float));
|
GGML_ASSERT(nb00 == sizeof(float));
|
||||||
GGML_ASSERT(nb10 == sizeof(float));
|
GGML_ASSERT(nb10 == sizeof(float));
|
||||||
|
|
||||||
|
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||||
|
|
||||||
|
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||||
|
|
||||||
|
int64_t o[4] = {0, 0, 0, 0};
|
||||||
|
o[dim] = src0->ne[dim];
|
||||||
|
|
||||||
|
const float * x;
|
||||||
|
|
||||||
|
// TODO: smarter multi-theading
|
||||||
for (int i3 = 0; i3 < ne3; i3++) {
|
for (int i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
||||||
if (i2 < ne02) { // src0
|
|
||||||
for (int i1 = 0; i1 < ne1; i1++) {
|
for (int i1 = 0; i1 < ne1; i1++) {
|
||||||
for (int i0 = 0; i0 < ne0; i0++) {
|
for (int i0 = 0; i0 < ne0; i0++) {
|
||||||
const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
|
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
|
x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
||||||
|
} else {
|
||||||
|
x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
||||||
|
}
|
||||||
|
|
||||||
float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
||||||
*y = *x;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // src1
|
|
||||||
else {
|
|
||||||
for (int i1 = 0; i1 < ne1; i1++) {
|
|
||||||
for (int i0 = 0; i0 < ne0; i0++) {
|
|
||||||
const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
|
|
||||||
|
|
||||||
float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
|
|
||||||
*y = *x;
|
*y = *x;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_compute_forward_concat(
|
static void ggml_compute_forward_concat(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
|
@ -11388,8 +11432,8 @@ static void ggml_compute_forward_gelu_f32(
|
||||||
|
|
||||||
const struct ggml_tensor * src0 = dst->src[0];
|
const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
|
||||||
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
|
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||||
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
|
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||||
|
|
||||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||||
|
@ -11451,8 +11495,8 @@ static void ggml_compute_forward_gelu_quick_f32(
|
||||||
|
|
||||||
const struct ggml_tensor * src0 = dst->src[0];
|
const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
|
||||||
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
|
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||||
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
|
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||||
|
|
||||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||||
|
@ -11514,8 +11558,8 @@ static void ggml_compute_forward_silu_f32(
|
||||||
|
|
||||||
const struct ggml_tensor * src0 = dst->src[0];
|
const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
|
||||||
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
|
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||||
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
|
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||||
|
|
||||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||||
@@ -11626,9 +11670,9 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+    GGML_ASSERT(ggml_is_contiguous_1(grad));
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(ggml_is_contiguous_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_are_same_shape(src0, grad));
 
@@ -14326,7 +14370,7 @@ static void ggml_compute_forward_rope_f32(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14410,29 +14454,22 @@ static void ggml_compute_forward_rope_f32(
                         dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
                     }
                 } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
+                    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                     for (int64_t ic = 0; ic < ne0; ic += 2) {
                         if (ic < n_dims) {
-                            const int64_t ib = 0;
+                            const int64_t i0 = ic/2;
 
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-                            float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                            const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                             float cos_theta, sin_theta;
                             rope_yarn(
-                                theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                                theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                                 &cos_theta, &sin_theta
                             );
 
                             sin_theta *= sin_sign;
 
                             theta_base *= theta_scale;
 
-                            const int64_t i0 = ib*n_dims + ic/2;
-
                             const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                             float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
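Note: with `ib` fixed at 0 in the old code, the output offset `ib*n_dims + ic/2` is simply `ic/2`, which is also the index now used for `freq_factors`, and `rope_yarn()` receives the raw dimension index `ic` instead of the derived `cur_rot`. A tiny self-contained sketch of that index collapse (illustration only, not part of the kernel):

    #include <cassert>
    #include <cstdint>

    int main() {
        const int64_t n_dims = 128; // head size used by the llama test cases in this diff
        const int64_t ib     = 0;   // the old loop always had ib == 0 here
        for (int64_t ic = 0; ic < n_dims; ic += 2) {
            assert(ib*n_dims + ic/2 == ic/2); // old output offset == new output offset
        }
        return 0;
    }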
@@ -14511,7 +14548,7 @@ static void ggml_compute_forward_rope_f16(
     int ir = 0;
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
+
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
 
@@ -14591,29 +14628,22 @@ static void ggml_compute_forward_rope_f16(
                         dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
                 } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    // it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
+                    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
                     for (int64_t ic = 0; ic < ne0; ic += 2) {
                         if (ic < n_dims) {
-                            const int64_t ib = 0;
+                            const int64_t i0 = ic/2;
 
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-                            float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+                            const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
 
                             float cos_theta, sin_theta;
                             rope_yarn(
-                                theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                                theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
                                 &cos_theta, &sin_theta
                             );
 
                             sin_theta *= sin_sign;
 
                             theta_base *= theta_scale;
 
-                            const int64_t i0 = ib*n_dims + ic/2;
-
                             const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                             ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -22840,6 +22870,14 @@ int ggml_cpu_has_sycl(void) {
 #endif
 }
 
+int ggml_cpu_has_rpc(void) {
+#if defined(GGML_USE_RPC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
     return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
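A minimal, hedged example of querying the new capability flag next to the existing ones (illustrative only; the three functions are the ones declared in this diff):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // ggml_cpu_has_rpc() reports whether ggml was built with GGML_USE_RPC
        std::printf("sycl:    %d\n", ggml_cpu_has_sycl());
        std::printf("rpc:     %d\n", ggml_cpu_has_rpc());
        std::printf("gpublas: %d\n", ggml_cpu_has_gpublas());
        return 0;
    }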
20  ggml.h
@@ -756,7 +756,6 @@ extern "C" {
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
     GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_empty     (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
@@ -765,6 +764,11 @@ extern "C" {
     GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
     GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
+    GGML_API GGML_CALL bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -1007,12 +1011,13 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // concat a and b on dim 2
+    // concat a and b along dim
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_concat(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   dim);
 
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
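A short usage sketch of the extended `ggml_concat()` (hedged: the shapes and the helper name are made up for illustration; per the updated comment, the old behaviour corresponds to dim == 2):

    #include "ggml.h"

    // Concatenate two F32 matrices along dim 1 (rows): 16x4 and 16x8 -> 16x12.
    // All dimensions other than the chosen dim must match between a and b.
    static struct ggml_tensor * concat_rows(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16, 4);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16, 8);
        return ggml_concat(ctx, a, b, 1);
    }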
@@ -1547,6 +1552,14 @@ extern "C" {
                       float                 beta_slow),
         "use ggml_rope_ext_inplace instead");
 
+    struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
     // compute correction dims for YaRN RoPE scaling
     GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
@@ -2419,6 +2432,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
     GGML_API int ggml_cpu_has_sycl       (void);
+    GGML_API int ggml_cpu_has_rpc        (void);
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
 
@@ -2670,14 +2670,12 @@ void main() {
         const uint i = row*p.ncols + ib*p.ndims + ic/2;
         const uint i2 = row/p.p_delta_rows;
 
-        const float cur_rot = p.inv_ndims * ic - ib;
-
         const int pos = data_b[i2];
         const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f;
         const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor;
 
         float cos_theta, sin_theta;
-        rope_yarn(theta_base, uint(cur_rot), cos_theta, sin_theta);
+        rope_yarn(theta_base, ic, cos_theta, sin_theta);
 
         const float x0 = float(data_a[i + 0]);
         const float x1 = float(data_a[i + p.ndims/2]);
@@ -37,11 +37,15 @@ class Keys:
         CONTEXT_LENGTH = "{arch}.context_length"
         EMBEDDING_LENGTH = "{arch}.embedding_length"
         BLOCK_COUNT = "{arch}.block_count"
+        LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
         FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
+        EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
         USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
         TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
         EXPERT_COUNT = "{arch}.expert_count"
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
+        EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
+        EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
         POOLING_TYPE = "{arch}.pooling_type"
         LOGIT_SCALE = "{arch}.logit_scale"
 
@@ -55,6 +59,8 @@ class Keys:
         LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
         LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
         CAUSAL = "{arch}.attention.causal"
+        Q_LORA_RANK = "{arch}.attention.q_lora_rank"
+        KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -64,6 +70,7 @@ class Keys:
         SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
         SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
         SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
+        SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
 
     class SSM:
         CONV_KERNEL = "{arch}.ssm.conv_kernel"
@@ -140,6 +147,7 @@ class MODEL_ARCH(IntEnum):
     DBRX = auto()
     OLMO = auto()
     ARCTIC = auto()
+    DEEPSEEK2 = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -185,6 +193,12 @@ class MODEL_TENSOR(IntEnum):
     SSM_A = auto()
     SSM_D = auto()
     SSM_OUT = auto()
+    ATTN_Q_A = auto()
+    ATTN_Q_B = auto()
+    ATTN_KV_A_MQA = auto()
+    ATTN_KV_B = auto()
+    ATTN_Q_A_NORM = auto()
+    ATTN_KV_A_NORM = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -221,6 +235,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
     MODEL_ARCH.ARCTIC: "arctic",
+    MODEL_ARCH.DEEPSEEK2: "deepseek2",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -266,6 +281,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
     MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
     MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
+    MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
+    MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
+    MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
+    MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
+    MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
+    MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -757,6 +778,33 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.DEEPSEEK2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     # TODO
 }
 
@@ -790,6 +838,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.DEEPSEEK2: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }
 
 #
@@ -374,9 +374,15 @@ class GGUFWriter:
     def add_block_count(self, length: int) -> None:
         self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
 
+    def add_leading_dense_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length)
+
     def add_feed_forward_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
 
+    def add_expert_feed_forward_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
 
@@ -407,6 +413,12 @@ class GGUFWriter:
     def add_expert_used_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
 
+    def add_expert_shared_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_SHARED_COUNT.format(arch=self.arch), count)
+
+    def add_expert_weights_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
 
@@ -416,6 +428,12 @@ class GGUFWriter:
     def add_causal_attention(self, value: bool) -> None:
         self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
 
+    def add_q_lora_rank(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length)
+
+    def add_kv_lora_rank(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
 
@@ -440,6 +458,9 @@ class GGUFWriter:
     def add_rope_scaling_finetuned(self, value: bool) -> None:
         self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)
 
+    def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value)
+
     def add_ssm_conv_kernel(self, value: int) -> None:
         self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)
 
@@ -256,6 +256,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.FFN_UP_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
         ),
 
         # AWQ-activation gate
@@ -285,6 +286,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.FFN_GATE_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
        ),
 
         # Feed-forward down
@@ -320,6 +322,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
@@ -383,6 +386,30 @@ class TensorNameMap:
             "model.layers.{bid}.out_proj",
             "backbone.layers.{bid}.mixer.out_proj",
         ),
+
+        MODEL_TENSOR.ATTN_Q_A: (
+            "model.layers.{bid}.self_attn.q_a_proj", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_Q_B: (
+            "model.layers.{bid}.self_attn.q_b_proj", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_KV_A_MQA: (
+            "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_KV_B: (
+            "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_Q_A_NORM: (
+            "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_KV_A_NORM: (
+            "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
+        ),
     }
 
     # architecture-specific block mappings
@@ -415,7 +442,7 @@ class TensorNameMap:
             if tensor not in MODEL_TENSORS[arch]:
                 continue
             # TODO: make this configurable
-            n_experts = 128
+            n_experts = 160
             for xid in range(n_experts):
                 tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
                 self.mapping[tensor_name] = (tensor, tensor_name)
@@ -106,8 +106,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-kompute.h -> ggml-kompute.h
     # src/ggml-metal.h -> ggml-metal.h
     # src/ggml-metal.m -> ggml-metal.m
-    # src/ggml-mpi.h -> ggml-mpi.h
-    # src/ggml-mpi.c -> ggml-mpi.c
     # src/ggml-opencl.cpp -> ggml-opencl.cpp
     # src/ggml-opencl.h -> ggml-opencl.h
     # src/ggml-quants.c -> ggml-quants.c
@@ -145,8 +143,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
         -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
         -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
-        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
         -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
         -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
         -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
@@ -1 +1 @@
-126d34985705a5a2222723c145cb4e125ac689f3
+2aae01fd9b8f9399f343cf18f46f38996ef52e2c
@@ -14,8 +14,6 @@ cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h
 cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
 cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
-cp -rpv ../ggml/src/ggml-mpi.h ./ggml-mpi.h
-cp -rpv ../ggml/src/ggml-mpi.c ./ggml-mpi.c
 cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
 cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
@@ -1138,26 +1138,37 @@ struct test_soft_max : public test_case {
 // GGML_OP_ROPE
 struct test_rope : public test_case {
     const ggml_type type;
-    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_a;
     int n_dims;
     int mode;
     int n_ctx;
+    float fs; // freq_scale
+    float ef; // ext_factor
+    float af; // attn_factor
     bool ff;
+    int v; // view (1 : non-contiguous a)
 
     std::string vars() override {
-        return VARS_TO_STR6(type, ne, n_dims, mode, n_ctx, ff);
+        return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
     }
 
     test_rope(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            int n_dims = 10, int mode = 0, int n_ctx = 512, bool ff = false)
-        : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx), ff(ff) {}
+            std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
+            int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
+        : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        }
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
         ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
-        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
         return out;
     }
 
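For reference, a hedged sketch of what the new `v` flag exercises: the same logical 4-D shape, backed by a larger allocation and accessed through a non-contiguous view, so the op under test has to honour the row strides (the helper name is made up; the view construction mirrors the code above):

    #include "ggml.h"

    static struct ggml_tensor * make_noncont(struct ggml_context * ctx, const int64_t ne_a[4]) {
        // allocate a bigger tensor, then view the ne_a-shaped corner of it
        struct ggml_tensor * big = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,
                                                      ne_a[0]*2, ne_a[1]*4, ne_a[2]*3, ne_a[3]);
        return ggml_view_4d(ctx, big, ne_a[0], ne_a[1], ne_a[2], ne_a[3],
                            big->nb[1], big->nb[2], big->nb[3], 0);
    }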
@@ -1165,11 +1176,11 @@ struct test_rope : public test_case {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (t->type == GGML_TYPE_I32) {
                 // pos
-                std::vector<int> data(ne[2]);
-                for (int i = 0; i < ne[2]; i++) {
+                std::vector<int> data(ne_a[2]);
+                for (int i = 0; i < ne_a[2]; i++) {
                     data[i] = rand() % n_ctx;
                 }
-                ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
+                ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
             } else {
                 if (t->ne[0] == n_dims/2) {
                     // frequency factors in the range [0.9f, 1.1f]
@@ -1259,22 +1270,41 @@ struct test_im2col : public test_case {
 // GGML_OP_CONCAT
 struct test_concat : public test_case {
     const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int64_t b_ne2;
+    const std::array<int64_t, 4> ne_a;
+    const int64_t ne_b_d;
+    const int dim;
+    const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, b_ne2);
+        return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
     }
 
     test_concat(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            int64_t b_ne2 = 10)
-        : type(type), ne(ne), b_ne2(b_ne2) {}
+            std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
+            int64_t ne_b_d = 10,
+            int dim = 2, int v = 0)
+        : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
-        ggml_tensor * out = ggml_concat(ctx, a, b);
+        auto ne_b = ne_a;
+        ne_b[dim] = ne_b_d;
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        }
+        ggml_tensor * b;
+        if (v & 2) {
+            auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
+            b = ggml_new_tensor(ctx, type, 4, ne.data());
+            b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
+        } else {
+            b = ggml_new_tensor(ctx, type, 4, ne_b.data());
+        }
+        ggml_tensor * out = ggml_concat(ctx, a, b, dim);
         return out;
     }
 };
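A hedged illustration of what one of the new test shapes does when run directly through the API (the values mirror the cases registered further down; the helper name is made up):

    #include "ggml.h"

    // Concatenate along dim 3: an 11x12x13x14 tensor and an 11x12x13x7 tensor
    // produce an 11x12x13x21 result, matching test_concat with dim == 3, v == 0.
    static struct ggml_tensor * concat_dim3(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 11, 12, 13, 14);
        struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 11, 12, 13, 7);
        return ggml_concat(ctx, a, b, 3);
    }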
@@ -2194,25 +2224,47 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
 
+    {
+        bool all = true;
+
+        for (float v : { 0, 1 }) {
+            for (float fs : { 1.0f, 1.4245f }) {
+                for (float ef : { 0.0f, 0.7465f }) {
+                    for (float af : { 1.0f, 1.4245f }) {
                         for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
                             // TODO: ff not supported yet for !neox
-                            test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512, false)); // llama 7B
-                            test_cases.emplace_back(new test_rope(type, {128,  40, 10, 1}, 128, 0, 512, false)); // llama 13B
-                            test_cases.emplace_back(new test_rope(type, {128,  52, 10, 1}, 128, 0, 512, false)); // llama 30B
-                            test_cases.emplace_back(new test_rope(type, {128,  64, 10, 1}, 128, 0, 512, false)); // llama 65B
+                            test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 7B
+                            if (all) {
+                                test_cases.emplace_back(new test_rope(type, {128,  40, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 13B
+                                test_cases.emplace_back(new test_rope(type, {128,  52, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 30B
+                                test_cases.emplace_back(new test_rope(type, {128,  64, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 65B
+                            }
 
                             for (bool ff : {false, true}) { // freq_factors
-                                test_cases.emplace_back(new test_rope(type, { 64,   1, 10, 1},  64, 2, 512, ff)); // neox (falcon 7B)
-                                test_cases.emplace_back(new test_rope(type, { 64,  71, 10, 1},  64, 2, 512, ff)); // neox (falcon 7B)
-                                test_cases.emplace_back(new test_rope(type, { 64,   8, 10, 1},  64, 2, 512, ff)); // neox (falcon 40B)
-                                test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1},  64, 2, 512, ff)); // neox (falcon 40B)
-                                test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  20, 2, 512, ff)); // neox (stablelm)
-                                test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512, ff)); // neox (phi-2)
+                                if (all) {
+                                    test_cases.emplace_back(new test_rope(type, { 64,   1, 10, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 64,  71, 10, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 64,   8, 10, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
+                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
+                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
+                                }
+
+                                test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
                             }
                         }
+                        all = false;
+                    }
+                }
+            }
+        }
+    }
 
-    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
-    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
+    for (int v : { 0, 1, 2, 3 }) {
+        for (int dim : { 0, 1, 2, 3, }) {
+            test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
+        }
+    }
 
     for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
@@ -28,6 +28,8 @@ printf "Tokenizing using (cpp) llama.cpp ...\n"
 cat /tmp/test-tokenizer-0-$name-py.log  | grep "tokenized in"
 cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
 
+set +e
+
 diff $input.tok $input.tokcpp > /dev/null 2>&1
 
 if [ $? -eq 0 ]; then
@@ -167,8 +167,10 @@ def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
     for m in range(iterations):
         rand.seed(m)
         words = rand.choices(special_tokens, k=500)
-        if tokenizer.add_bos_token:  # skip spam warning of double BOS
-            while words and words[0] == tokenizer.bos_token:
+        if words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
+            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
+                words.pop(0)
+            if tokenizer.add_bos_token:  # drop all starting BOS
                 words.pop(0)
         yield "".join(words)
 
@@ -293,15 +295,17 @@ def main(argv: list[str] = None):
     model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
     tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
 
-    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", True)
-    tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", False)
-
     def func_tokenize1(text: str):
         return model.tokenize(text, add_special=True, parse_special=True)
 
     def func_tokenize2(text: str):
         return tokenizer.encode(text, add_special_tokens=True)
 
+    ids = func_tokenize2("a")
+    assert 1 <= len(ids) <= 3
+    add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
+    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", add_bos_token)
+
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
@@ -324,8 +328,10 @@ if __name__ == "__main__":
     # import os
     # tokenizers = os.listdir(path_tokenizers)
     tokenizers = [
-        "llama-spm",   # SPM
-        "phi-3",       # SPM
+        # "llama-spm",   # SPM
+        # "phi-3",       # SPM
+        "jina-v2-en",  # WPM
+        "bert-bge",    # WPM
     ]
 
     for tokenizer in tokenizers: