From 589b48d41efb0e95133b77c335f4fb9779af9bfb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Sep 2024 14:38:18 +0300 Subject: [PATCH 01/40] contrib : add Resources section (#9675) --- CONTRIBUTING.md | 5 +++++ README.md | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a9e000e52..3d7c6f86c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,3 +27,8 @@ ![matmul](media/matmul.png) +# Resources + +The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: + +https://github.com/ggerganov/llama.cpp/projects diff --git a/README.md b/README.md index a452a6d78..ecc2df8ca 100644 --- a/README.md +++ b/README.md @@ -443,7 +443,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio - Contributors can open PRs - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions -- Any help with managing issues and PRs is very appreciated! +- Any help with managing issues, PRs and projects is very appreciated! - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) From f99d3f8367174f7aba73c07fd87de687d4a0ece1 Mon Sep 17 00:00:00 2001 From: nopperl <54780682+nopperl@users.noreply.github.com> Date: Sun, 29 Sep 2024 12:02:06 +0000 Subject: [PATCH 02/40] py : add model class for Chameleon conversion (#9683) --- convert_hf_to_gguf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 96a8830e9..f3857d487 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4162,7 +4162,8 @@ class GraniteMoeModel(GraniteModel): return super().modify_tensors(data_torch, name, bid) -@Model.register("ChameleonForCausalLM") +@Model.register("ChameleonForConditionalGeneration") +@Model.register("ChameleonForCausalLM") # obsolete class ChameleonModel(Model): model_arch = gguf.MODEL_ARCH.CHAMELEON From faac0bae265449fd988c57bf894018edc36fbe1e Mon Sep 17 00:00:00 2001 From: matiaslin <45382001+matiaslin@users.noreply.github.com> Date: Sun, 29 Sep 2024 05:25:00 -0700 Subject: [PATCH 03/40] common : ensure llama_batch size does not exceed max size (#9668) A crash was observed when the number of tokens added to a batch exceeds llama_batch size. An assertion in llama_batch_add was added to protect against llama_batch size overflow. 
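For context, the assertion fires when a caller tries to add more tokens than the batch was initialized for. The snippet below is not part of the patch; it is a minimal caller-side sketch, assuming the common.h helpers (llama_batch_init / llama_batch_add / llama_batch_free) as they exist at this point in the tree, and a hypothetical token vector:

    // sketch: size the batch to the number of tokens so the new
    // GGML_ASSERT in llama_batch_add() can never trigger
    #include "common.h"
    #include "llama.h"

    #include <vector>

    static llama_batch make_prompt_batch(const std::vector<llama_token> & tokens) {
        // capacity must cover every token we intend to add; exceeding it is
        // exactly the condition the new assertion guards against
        llama_batch batch = llama_batch_init((int32_t) tokens.size(), /*embd=*/0, /*n_seq_max=*/1);

        for (size_t i = 0; i < tokens.size(); ++i) {
            const bool want_logits = (i == tokens.size() - 1); // logits only for the last token
            llama_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, want_logits);
        }

        return batch; // caller releases it later with llama_batch_free()
    }
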
--- common/common.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index e2b8574bf..a0611f3d1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1437,6 +1437,8 @@ void llama_batch_add( llama_pos pos, const std::vector & seq_ids, bool logits) { + GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded"); + batch.token [batch.n_tokens] = id; batch.pos [batch.n_tokens] = pos; batch.n_seq_id[batch.n_tokens] = seq_ids.size(); From 6084bfb261b03f812de2255b05b6b5bb8d1c7171 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 24 Sep 2024 13:23:59 +0300 Subject: [PATCH 04/40] ggml : fix GGML_MAX_N_THREADS + improve formatting (ggml/969) --- ggml/include/ggml.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 9f96e0c48..f46d4a8a6 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -229,14 +229,16 @@ #define GGML_MAX_PARAMS 2048 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 10 -#ifndef GGML_MAX_NAME -#define GGML_MAX_NAME 64 #define GGML_MAX_N_THREADS 512 - -#endif #define GGML_MAX_OP_PARAMS 64 + +#ifndef GGML_MAX_NAME +# define GGML_MAX_NAME 64 +#endif + #define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_GRAPH_SIZE 2048 + #if UINTPTR_MAX == 0xFFFFFFFF #define GGML_MEM_ALIGN 4 #else @@ -259,21 +261,21 @@ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) #ifndef NDEBUG -#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0) +# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0) #elif defined(__GNUC__) -#define GGML_UNREACHABLE() __builtin_unreachable() +# define GGML_UNREACHABLE() __builtin_unreachable() #elif defined(_MSC_VER) -#define GGML_UNREACHABLE() __assume(0) +# define GGML_UNREACHABLE() __assume(0) #else -#define GGML_UNREACHABLE() ((void) 0) +# define GGML_UNREACHABLE() ((void) 0) #endif #ifdef __cplusplus -#define GGML_NORETURN [[noreturn]] +# define GGML_NORETURN [[noreturn]] #elif defined(_MSC_VER) -#define GGML_NORETURN __declspec(noreturn) +# define GGML_NORETURN __declspec(noreturn) #else -#define GGML_NORETURN _Noreturn +# define GGML_NORETURN _Noreturn #endif #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) From 544f409b4bd8fc98a3e87820f0ac934e00402de7 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Thu, 26 Sep 2024 08:59:42 +0200 Subject: [PATCH 05/40] vulkan : argsort barriers must be under uniform control flow (ggml/951) a return before a barrier (that happens only in some threads in a workgroup) leads to UB. While the old code actually works on some devices, it fails on some others (i.e. "smaller" GPUs). BTW, I think it would be better to set specialization constants when the graph is built, in that way the local workgroup could be sized appropriately. But it would take a lot of work. 
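To make the control-flow point concrete, here is a minimal standalone sketch (not taken from the patch) of the pattern the fix applies in argsort.comp: every invocation must reach barrier(), so out-of-range invocations are masked with an if instead of returning early. The push-constant layout and workgroup size below are placeholders, not the shader's real interface:

    #version 450
    // keep barrier() in uniform control flow: mask the work, don't return early
    layout(local_size_x = 128) in;

    layout(push_constant) uniform Params { uint ncols; } p; // hypothetical

    shared uint tmp[128];

    void main() {
        const uint col = gl_LocalInvocationID.x;

        // BAD: `if (col >= p.ncols) return;` here -- some invocations would
        // skip the barrier below while others wait on it: undefined behaviour.

        if (col < p.ncols) {
            tmp[col] = col;    // only in-range invocations do the work
        }
        barrier();             // reached by every invocation in the workgroup

        if (col < p.ncols) {
            // ... read values written by other invocations ...
        }
    }
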
Signed-off-by: Salvatore Mesoraca --- ggml/src/vulkan-shaders/argsort.comp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ggml/src/vulkan-shaders/argsort.comp b/ggml/src/vulkan-shaders/argsort.comp index e55414b03..d4fa45b1e 100644 --- a/ggml/src/vulkan-shaders/argsort.comp +++ b/ggml/src/vulkan-shaders/argsort.comp @@ -29,20 +29,18 @@ void main() { const int col = int(gl_LocalInvocationID.x); const uint row = gl_WorkGroupID.y; - if (col >= p.ncols_pad) { - return; - } - const uint row_offset = row * p.ncols; // initialize indices - dst_row[col] = col; + if (col < p.ncols_pad) { + dst_row[col] = col; + } barrier(); for (uint k = 2; k <= p.ncols_pad; k *= 2) { for (uint j = k / 2; j > 0; j /= 2) { const uint ixj = col ^ j; - if (ixj > col) { + if (col < p.ncols_pad && ixj > col) { if ((col & k) == 0) { if (dst_row[col] >= p.ncols || (dst_row[ixj] < p.ncols && (p.order == ASC ? From 0de8b203f1d31cf5ee0d2a3560a0ad78d44f4d4c Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Fri, 27 Sep 2024 02:58:01 -0500 Subject: [PATCH 06/40] vulkan : fix build for GGML_VULKAN_RUN_TESTS, add TFLOPS to log (ggml/961) --- ggml/src/ggml-vulkan.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index a877145e8..70b729154 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -5013,6 +5013,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } } + ggml_pipeline_allocate_descriptor_sets(ctx->device); + vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); @@ -5129,7 +5131,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t avg_err /= m * n; - std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms avg_err=" << avg_err << std::endl; + double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0); + + std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; if (avg_err > 0.1) { std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; @@ -5251,12 +5255,14 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_pipeline_request_descriptor_sets(ctx->device, p, 1); + ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); ggml_vk_ctx_begin(ctx->device, subctx); const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; - ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); ggml_vk_ctx_end(subctx); auto begin = 
std::chrono::high_resolution_clock::now(); @@ -5383,6 +5389,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, } } + ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(y_buf, 0, y, y_sz); @@ -5450,7 +5458,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, avg_err /= m * n; - std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl; + double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0); + + std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; if (avg_err > 0.01 || std::isnan(avg_err)) { std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; @@ -5502,9 +5512,6 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { #if defined(GGML_VULKAN_RUN_TESTS) - ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1); From 641002fba8d2a0c0269027e23d2ef58e90546028 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 29 Sep 2024 11:50:17 -0500 Subject: [PATCH 07/40] vulkan : multithread pipeline creation (ggml/963) --- ggml/src/ggml-vulkan.cpp | 41 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 70b729154..c677a2728 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "ggml-impl.h" #include "ggml-backend-impl.h" @@ -607,13 +609,16 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend); -static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) { +// variables to track number of compiles in progress +static uint32_t compile_count = 0; +static std::mutex compile_count_mutex; +static std::condition_variable compile_count_cond; + +static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector specialization_constants, uint32_t align) { VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), 
specialization_constants, " << align << ")"); GGML_ASSERT(parameter_count > 0); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT - std::lock_guard guard(device->mutex); - pipeline = std::make_shared(); pipeline->name = name; pipeline->parameter_count = parameter_count; @@ -681,7 +686,17 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co pipeline->layout); pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; - device->pipelines.insert({ pipeline->name, pipeline }); + { + std::lock_guard guard(device->mutex); + device->pipelines.insert({ pipeline->name, pipeline }); + } + + { + std::lock_guard guard(compile_count_mutex); + assert(compile_count > 0); + compile_count--; + } + compile_count_cond.notify_all(); } static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) { @@ -1194,6 +1209,20 @@ static void ggml_vk_load_shaders(vk_device& device) { device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared(); device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared(); + std::vector> compiles; + auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) { + { + // wait until fewer than N compiles are in progress + uint32_t N = std::max(1u, std::thread::hardware_concurrency()); + std::unique_lock guard(compile_count_mutex); + while (compile_count >= N) { + compile_count_cond.wait(guard); + } + compile_count++; + } + compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align)); + }; + if (device->fp16) { ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); @@ -1743,6 +1772,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); + + for (auto &c : compiles) { + c.wait(); + } } static vk_device ggml_vk_get_device(size_t idx) { From aaa40999251f5d18309b3fddc5a7d576f5fdb4e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 29 Sep 2024 19:56:17 +0200 Subject: [PATCH 08/40] CUDA: remove bad assert (ggml/972) --- ggml/src/ggml-cuda/im2col.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu index 3d0d8d4e6..16463ab0f 100644 --- a/ggml/src/ggml-cuda/im2col.cu +++ b/ggml/src/ggml-cuda/im2col.cu @@ -69,7 +69,6 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * 
dst) { float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); - GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); From d0b1d663e430354ab35853a6e1bce51cc8819376 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Sep 2024 21:16:07 +0300 Subject: [PATCH 09/40] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 36eeed0cc..aa301462a 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -336c10a4c3c8ec99af484b25a0cddd397a09cdb2 +9a24b8c8c40eab7262d067e91d08df160678df8d From c919d5db39c8a7fcb64737f008e4b105ee0acd20 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Sep 2024 21:18:23 +0300 Subject: [PATCH 10/40] ggml : define missing HWCAP flags (#9684) ggml-ci Co-authored-by: Willy Tarreau --- ggml/src/ggml.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index fac4466e3..81b651c6a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3687,6 +3687,10 @@ static inline int ggml_up(int n, int m) { #include #endif +#if !defined(HWCAP2_I8MM) +#define HWCAP2_I8MM 0 +#endif + static void ggml_init_arm_arch_features(void) { #if defined(__linux__) && defined(__aarch64__) uint32_t hwcap = getauxval(AT_HWCAP); From 8277a817f18967581b02b2248989d773e8e99998 Mon Sep 17 00:00:00 2001 From: Ruchira Hasaranga Date: Mon, 30 Sep 2024 13:53:42 +0530 Subject: [PATCH 11/40] console : utf-8 fix for windows stdin (#9690) * utf-8 fix for windows stdin * Update common/console.cpp --------- Co-authored-by: Georgi Gerganov --- common/console.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/console.cpp b/common/console.cpp index f65cbc6ed..078a8d678 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -94,6 +94,9 @@ namespace console { simple_io = true; } } + if (simple_io) { + _setmode(_fileno(stdin), _O_U8TEXT); + } #else // POSIX-specific console initialization if (!simple_io) { From ace4f4be37abed4801fbd54a94cf38a7ae462416 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 30 Sep 2024 17:48:49 +0300 Subject: [PATCH 12/40] flake.lock: Update (#9680) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/c04d5652cfa9742b1d519688f65d1bbccea9eb7e?narHash=sha256-PmUr/2GQGvFTIJ6/Tvsins7Q43KTMvMFhvG6oaYK%2BWk%3D' (2024-09-19) → 'github:NixOS/nixpkgs/1925c603f17fc89f4c8f6bf6f631a802ad85d784?narHash=sha256-J%2BPeFKSDV%2BpHL7ukkfpVzCOO7mBSrrpJ3svwBFABbhI%3D' (2024-09-26) Co-authored-by: github-actions[bot] --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 6333a09f0..dde1ab527 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1726755586, - "narHash": "sha256-PmUr/2GQGvFTIJ6/Tvsins7Q43KTMvMFhvG6oaYK+Wk=", + "lastModified": 1727348695, + "narHash": "sha256-J+PeFKSDV+pHL7ukkfpVzCOO7mBSrrpJ3svwBFABbhI=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "c04d5652cfa9742b1d519688f65d1bbccea9eb7e", + "rev": "1925c603f17fc89f4c8f6bf6f631a802ad85d784", "type": "github" }, "original": { From 08a43d05b6ba74de97610ae519450ad9996475e0 Mon Sep 17 00:00:00 2001 From: vb Date: Mon, 30 Sep 2024 17:03:47 +0200 Subject: [PATCH 13/40] py : update transfomers version (#9694) * update 
transfomers version. * update hfh version. --- examples/server/tests/requirements.txt | 2 +- requirements/requirements-convert_legacy_llama.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index f2d7e5c57..553954872 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,6 +1,6 @@ aiohttp~=3.9.3 behave~=1.2.6 -huggingface_hub~=0.20.3 +huggingface_hub~=0.23.2 numpy~=1.26.4 openai~=1.30.3 prometheus-client~=0.20.0 diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt index 1d07b0952..859204b27 100644 --- a/requirements/requirements-convert_legacy_llama.txt +++ b/requirements/requirements-convert_legacy_llama.txt @@ -1,5 +1,5 @@ numpy~=1.26.4 sentencepiece~=0.2.0 -transformers>=4.40.1,<5.0.0 +transformers>=4.45.1,<5.0.0 gguf>=0.1.0 protobuf>=4.21.0,<5.0.0 From 511636df0c90826b4dd1fc21ff260c19d69a3b5d Mon Sep 17 00:00:00 2001 From: compilade Date: Mon, 30 Sep 2024 14:13:16 -0400 Subject: [PATCH 14/40] ci : reduce severity of unused Pyright ignore comments (#9697) --- .github/workflows/python-type-check.yml | 4 +++- examples/llava/convert_image_encoder_to_gguf.py | 4 ++-- pyrightconfig.json | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index e5ff5e6d7..373bb6010 100644 --- a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -4,11 +4,13 @@ on: push: paths: - '.github/workflows/python-type-check.yml' + - 'pyrightconfig.json' - '**.py' - '**/requirements*.txt' pull_request: paths: - '.github/workflows/python-type-check.yml' + - 'pyrightconfig.json' - '**.py' - '**/requirements*.txt' @@ -33,6 +35,6 @@ jobs: - name: Type-check with Pyright uses: jakebailey/pyright-action@v2 with: - version: 1.1.370 + version: 1.1.382 level: warning warnings: true diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert_image_encoder_to_gguf.py index 36f6b92fb..4fa1d6cea 100644 --- a/examples/llava/convert_image_encoder_to_gguf.py +++ b/examples/llava/convert_image_encoder_to_gguf.py @@ -274,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu) if has_llava_projector: - model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue] + model.vision_model.encoder.layers.pop(-1) projector = torch.load(args.llava_projector) for name, data in projector.items(): name = get_tensor_name(name) @@ -288,7 +288,7 @@ if has_llava_projector: print("Projector tensors added\n") -state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue] +state_dict = model.state_dict() for name, data in state_dict.items(): if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): # we don't need this diff --git a/pyrightconfig.json b/pyrightconfig.json index 6016f4b6d..9acbbeb78 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -5,7 +5,8 @@ "reportUnusedImport": "warning", "reportDuplicateImport": "error", "reportDeprecated": "warning", - "reportUnnecessaryTypeIgnoreComment": "warning", + "reportUnnecessaryTypeIgnoreComment": "information", + "disableBytesTypePromotions": false, // TODO: change once Python 3.12 is the minimum "executionEnvironments": [ { // TODO: make this version override work correctly From 6f1d9d71f4c568778a7637ff6582e6f6ba5fb9d3 Mon Sep 17 00:00:00 2001 From: 
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com> Date: Mon, 30 Sep 2024 21:57:12 +0300 Subject: [PATCH 15/40] Fix Docker ROCM builds, use AMDGPU_TARGETS instead of GPU_TARGETS (#9641) * Fix Docker ROCM builds, use AMDGPU_TARGETS instead of GPU_TARGETS * Set ROCM_DOCKER_ARCH as string due it incorrectly build and cause OOM exit code --- .devops/full-rocm.Dockerfile | 6 +++--- .devops/llama-cli-rocm.Dockerfile | 6 +++--- .devops/llama-server-rocm.Dockerfile | 6 +++--- .github/workflows/build.yml | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index 680d1cb92..df496bcd2 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH=\ +ARG ROCM_DOCKER_ARCH="\ gfx803 \ gfx900 \ gfx906 \ @@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\ gfx1030 \ gfx1100 \ gfx1101 \ - gfx1102 + gfx1102" COPY requirements.txt requirements.txt COPY requirements requirements @@ -34,7 +34,7 @@ WORKDIR /app COPY . . # Set nvcc architecture -ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile index c3d1ab067..e60c747bd 100644 --- a/.devops/llama-cli-rocm.Dockerfile +++ b/.devops/llama-cli-rocm.Dockerfile @@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH=\ +ARG ROCM_DOCKER_ARCH="\ gfx803 \ gfx900 \ gfx906 \ @@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\ gfx1030 \ gfx1100 \ gfx1101 \ - gfx1102 + gfx1102" COPY requirements.txt requirements.txt COPY requirements requirements @@ -34,7 +34,7 @@ WORKDIR /app COPY . . # Set nvcc architecture -ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index fd0e19ad6..8553af75b 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH=\ +ARG ROCM_DOCKER_ARCH="\ gfx803 \ gfx900 \ gfx906 \ @@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\ gfx1030 \ gfx1100 \ gfx1101 \ - gfx1102 + gfx1102" COPY requirements.txt requirements.txt COPY requirements requirements @@ -34,7 +34,7 @@ WORKDIR /app COPY . . 
# Set nvcc architecture -ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e6a977b60..c71d422e7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1032,7 +1032,7 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON + cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" From 1927378bcce20ba72b6c89d5977b854a4bcaeb5d Mon Sep 17 00:00:00 2001 From: compilade Date: Tue, 1 Oct 2024 02:31:36 -0400 Subject: [PATCH 16/40] convert : refactor rope_freqs generation (#9396) * convert : refactor rope_freqs generation This should also fix vocab-only conversion for Phi-3. * convert : adapt MiniCPM3 to separate rope_freqs insertion MiniCPM3's tokenizer is treated as a SentencePiece tokenizer to avoid having to run its custom Python code which mixes tokenization in the same file as tool calls. gguf-py : add long and short RoPE factors to tensor mappings Empty, but the key names are used to populate the mappings. --- convert_hf_to_gguf.py | 61 ++++++++++++++++++---------------- convert_lora_to_gguf.py | 5 ++- gguf-py/gguf/constants.py | 4 +++ gguf-py/gguf/tensor_mapping.py | 3 ++ 4 files changed, 44 insertions(+), 29 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f3857d487..da5feb25b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -15,6 +15,7 @@ from enum import IntEnum from pathlib import Path from hashlib import sha256 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast +from itertools import chain import math import numpy as np @@ -64,7 +65,6 @@ class Model: model_name: str | None metadata_override: Path | None dir_model_card: Path - is_lora: bool # subclasses should define this! 
model_arch: gguf.MODEL_ARCH @@ -72,7 +72,7 @@ class Model: def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, - split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False): + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") @@ -94,7 +94,6 @@ class Model: self.metadata_override = metadata_override self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py - self.is_lora = is_lora # true if model is used inside convert_lora_to_gguf.py # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: @@ -270,10 +269,14 @@ class Model: return False + # some models need extra generated tensors (like rope_freqs) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + return () + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - for name, data_torch in self.get_tensors(): + for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): # we don't need these if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue @@ -1617,7 +1620,7 @@ class LlamaModel(Model): return [(self.map_tensor_name(name), data_torch)] - def prepare_tensors(self): + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) @@ -1644,9 +1647,9 @@ class LlamaModel(Model): smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - if not self.is_lora: - self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: @@ -1870,8 +1873,6 @@ class MiniCPM3Model(Model): def set_gguf_parameters(self): hparams = self.hparams - rope_dims = hparams["qk_rope_head_dim"] - self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) @@ -1887,24 +1888,25 @@ class MiniCPM3Model(Model): self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is None: - return + if rope_scaling is not None: + rope_dims = self.hparams["qk_rope_head_dim"] - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) - if long_factors is None or 
short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) def set_vocab(self): - self._set_vocab_llama_hf() + self._set_vocab_sentencepiece() def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -2216,6 +2218,13 @@ class Phi3MiniModel(Model): self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rope_dims = n_embd // n_head + # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(['rope_scaling'], True) if rope_scaling is None: @@ -2245,9 +2254,8 @@ class Phi3MiniModel(Model): if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - if not self.is_lora: - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) - self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) @Model.register("PlamoForCausalLM") @@ -4071,7 +4079,7 @@ class ExaoneModel(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) - def prepare_tensors(self): + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) @@ -4098,10 +4106,7 @@ class ExaoneModel(Model): smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - if not self.is_lora: - 
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) - - super().prepare_tensors() + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) @Model.register("GraniteForCausalLM") diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index d1c94e580..439a78de1 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -331,6 +331,10 @@ if __name__ == '__main__': self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) super().set_gguf_parameters() + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # Never add extra tensors (e.g. rope_freqs) for LoRA adapters + return () + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} @@ -392,7 +396,6 @@ if __name__ == '__main__': dry_run=args.dry_run, dir_lora_model=dir_lora, lora_alpha=alpha, - is_lora=True, ) logger.info("Exporting model...") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ebe66a4a3..e08617ba2 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -814,6 +814,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_Q, @@ -892,6 +894,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q_A, MODEL_TENSOR.ATTN_Q_B, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index e7e9b6fd5..f4a787c56 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -87,6 +87,9 @@ class TensorNameMap: "rope.freqs", # llama-pth "rotary_pos_emb.inv_freq", # chatglm ), + + MODEL_TENSOR.ROPE_FACTORS_LONG: (), + MODEL_TENSOR.ROPE_FACTORS_SHORT: (), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { From a90484c6d9db699bf739d0f33daf1c50cbdd45c9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Oct 2024 11:42:01 +0300 Subject: [PATCH 17/40] llama : print correct model type for Llama 3.2 1B and 3B --- src/llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index c466cd88b..d1d27d21e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5502,8 +5502,10 @@ static void llm_load_hparams( } } else { switch (hparams.n_layer) { + case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B case 22: model.type = e_model::MODEL_1B; break; case 26: model.type = e_model::MODEL_3B; break; + case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B // granite uses a vocab with len 49152 case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? 
e_model::MODEL_7B : e_model::MODEL_8B); break; case 36: model.type = e_model::MODEL_8B; break; // granite From cad341d88924788b940997c55e7bef000c99e784 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Oct 2024 16:00:25 +0300 Subject: [PATCH 18/40] metal : reduce command encoding overhead (#9698) * metal : reduce command encoding overhead ggml-ci * metal : add comments --- examples/cvector-generator/pca.hpp | 7 - examples/llava/clip.cpp | 6 - ggml/include/ggml-metal.h | 5 - ggml/src/ggml-metal.m | 3888 ++++++++++++++-------------- src/llama.cpp | 6 - 5 files changed, 2000 insertions(+), 1912 deletions(-) diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index a969c486d..f6e307fbc 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -204,13 +204,6 @@ static ggml_status compute_piter( ggml_backend_cpu_set_n_threads(model.backend, params.n_threads); } -// TODO: enable GPU support when support for GGML_OP_SQRT is added -//#ifdef GGML_USE_METAL -// if (ggml_backend_is_metal(model.backend)) { -// ggml_backend_metal_set_n_cb(model.backend, params.n_threads); -// } -//#endif - ggml_status res = ggml_backend_graph_compute(model.backend, gf); if (res == GGML_STATUS_SUCCESS) { auto extract_i = [](std::string prefix, std::string str) -> int { diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 8aa7b0750..14e02c8dd 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2444,12 +2444,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); } -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(ctx->backend)) { - ggml_backend_metal_set_n_cb(ctx->backend, n_threads); - } -#endif - ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index d483cf1ac..4d4165324 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -25,9 +25,6 @@ #include #include -// max memory buffers that can be mapped to the device -#define GGML_METAL_MAX_BUFFERS 64 - struct ggml_tensor; struct ggml_cgraph; @@ -48,8 +45,6 @@ GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); -GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); - GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index ef3b7f0e8..9da08fe2e 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -12,6 +12,12 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +// max memory buffers that can be mapped to the device +#define GGML_METAL_MAX_BUFFERS 64 + +// max number of MTLCommandBuffer used to submit a graph for processing +#define GGML_METAL_MAX_COMMAND_BUFFERS 8 + #ifdef GGML_METAL_NDEBUG #define GGML_METAL_LOG(...) #define GGML_METAL_LOG_INFO(...) 
@@ -221,11 +227,11 @@ enum ggml_metal_kernel_type { }; struct ggml_backend_metal_context { - int n_cb; - id device; id queue; + MTLComputePassDescriptor * edesc; + dispatch_queue_t d_queue; struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT]; @@ -233,7 +239,27 @@ struct ggml_backend_metal_context { bool support_simdgroup_reduction; bool support_simdgroup_mm; - bool should_capture_next_compute; + // capture state + bool capture_next_compute; + bool capture_started; + + id capture_scope; + + // command buffer state + int n_cb; // number of extra threads used to submit the command buffers + int n_nodes_0; // number of nodes submitted by the main thread + int n_nodes_1; // remaining number of nodes submitted by the n_cb threads + int n_nodes_per_cb; + + struct ggml_cgraph * gf; + + // the callback given to the thread pool + // TODO: ideally, this should be created once, utilizing the command buffer state above + // for some reason, doing it like this leads to a crash + void (^encode_async)(size_t ith); + + // n_cb command buffers + 1 used by the main thread + id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; @@ -303,7 +329,7 @@ static void * ggml_metal_host_malloc(size_t n) { return data; } -static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { +static struct ggml_backend_metal_context * ggml_metal_init(void) { GGML_METAL_LOG_INFO("%s: allocating\n", __func__); #if TARGET_OS_OSX && !GGML_METAL_NDEBUG @@ -322,8 +348,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { // Configure context struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context)); ctx->device = device; - ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; + ctx->edesc = MTLComputePassDescriptor.computePassDescriptor; + ctx->edesc.dispatchType = MTLDispatchTypeSerial; ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); id metal_library; @@ -455,7 +482,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); - ctx->should_capture_next_compute = false; + ctx->capture_next_compute = false; + ctx->capture_started = false; + ctx->capture_scope = nil; + + ctx->gf = nil; + ctx->encode_async = nil; + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { + ctx->command_buffers[i] = nil; + } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { @@ -686,6 +721,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { } [metal_library release]; + return ctx; } @@ -874,874 +910,820 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx } } -static enum ggml_status ggml_metal_graph_compute( - struct ggml_backend_metal_context * ctx, - struct ggml_cgraph * gf) { +static void ggml_metal_encode_node( + struct ggml_backend_metal_context * ctx, + int idx, + id encoder) { + struct ggml_cgraph * gf = ctx->gf; - @autoreleasepool { - MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; - edesc.dispatchType = MTLDispatchTypeSerial; + struct ggml_tensor * node = ggml_graph_node(gf, idx); - // create multiple command buffers and enqueue them - // then, we encode the graph into the command buffers in parallel + //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op)); - const int n_nodes = gf->n_nodes; - const int n_cb = ctx->n_cb; - const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; + struct ggml_tensor * src0 = node->src[0]; + struct ggml_tensor * src1 = node->src[1]; + struct ggml_tensor * src2 = node->src[2]; + struct ggml_tensor * dst = node; - const bool should_capture = ctx->should_capture_next_compute; - if (should_capture) { - ctx->should_capture_next_compute = false; - - MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new]; - descriptor.captureObject = ctx->queue; - - NSError * error = nil; - if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { - GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); - GGML_ABORT("capture failed"); - } + if (ggml_is_empty(dst)) { + return; } - id command_buffer_builder[n_cb]; - for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - command_buffer_builder[cb_idx] = command_buffer; - - // always enqueue the first two command buffers - // enqueue all of the command buffers if we don't need to abort - if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; - } + switch (dst->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop -> next node + } return; + default: + { + } break; } - const id *command_buffers = command_buffer_builder; + if (!ggml_metal_supports_op(ctx, dst)) { + GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); + GGML_ABORT("unsupported op"); + } - dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) { - const int cb_idx = iter; + const int64_t ne00 = src0 ? src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; - size_t offs_src0 = 0; - size_t offs_src1 = 0; - size_t offs_src2 = 0; - size_t offs_dst = 0; + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? 
src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; - id command_buffer = command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; + const int64_t ne10 = src1 ? src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; - const int node_start = (cb_idx + 0) * n_nodes_per_cb; - const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; - for (int i = node_start; i < node_end; ++i) { - if (i == -1) { - [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; - continue; - } + const int64_t ne20 = src2 ? src2->ne[0] : 0; + const int64_t ne21 = src2 ? src2->ne[1] : 0; + const int64_t ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22); + const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); - //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); + const uint64_t nb21 = src2 ? src2->nb[1] : 0; + const uint64_t nb22 = src2 ? src2->nb[2] : 0; + const uint64_t nb23 = src2 ? src2->nb[3] : 0; - struct ggml_tensor * src0 = gf->nodes[i]->src[0]; - struct ggml_tensor * src1 = gf->nodes[i]->src[1]; - struct ggml_tensor * src2 = gf->nodes[i]->src[2]; - struct ggml_tensor * dst = gf->nodes[i]; + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; - if (ggml_is_empty(dst)) { - continue; - } + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; - switch (dst->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - { - // noop -> next node - } continue; + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_src2 = 0; + size_t offs_dst = 0; + + id id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil; + id id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil; + id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; + id id_dst = dst ? 
ggml_metal_get_buffer(dst, &offs_dst) : nil; + + //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //if (src0) { + // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // ggml_is_contiguous(src0), src0->name); + //} + //if (src1) { + // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // ggml_is_contiguous(src1), src1->name); + //} + //if (dst) { + // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // dst->name); + //} + + switch (dst->op) { + case GGML_OP_CONCAT: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; + + const int32_t dim = ((const int32_t *) dst->op_params)[0]; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&dim length:sizeof(dim) atIndex:27]; + + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + { + GGML_ASSERT(src0t == GGML_TYPE_F32); + GGML_ASSERT(src1t == GGML_TYPE_F32); + + const size_t offs = 0; + + bool bcast_row = false; + + int64_t nb = ne00; // used by the "row" kernels + + id pipeline = nil; + + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // src1 is a row + GGML_ASSERT(ne11 == 1); + + nb = ne00 / 4; + switch (dst->op) { + case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; + case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; + case GGML_OP_DIV: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; + default: GGML_ABORT("fatal error"); + } + + bcast_row = true; + } else { + switch (dst->op) { + case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; + case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break; + case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; + case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; + default: GGML_ABORT("fatal error"); + } + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:28]; + + if (bcast_row) { + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } else { + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } + } break; + case GGML_OP_REPEAT: + { + id pipeline; + + switch (src0t) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; + case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; + case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; + default: GGML_ABORT("fatal error"); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + 
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ACC: + { + GGML_ASSERT(src0t == GGML_TYPE_F32); + GGML_ASSERT(src1t == GGML_TYPE_F32); + GGML_ASSERT(dstt == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + const size_t pnb1 = ((const int32_t *) dst->op_params)[0]; + const size_t pnb2 = ((const int32_t *) dst->op_params)[1]; + const size_t pnb3 = ((const int32_t *) dst->op_params)[2]; + const size_t offs = ((const int32_t *) dst->op_params)[3]; + + const bool inplace = (bool) ((const int32_t *) dst->op_params)[4]; + + if (!inplace) { + // run a separete kernel to cpy src->dst + // not sure how to avoid this + // TODO: make a simpler cpy_bytes kernel + + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } + + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 
length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8]; + [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9]; + [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24]; + [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25]; + [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26]; + [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); + + [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_SCALE: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + float scale; + memcpy(&scale, dst->op_params, sizeof(scale)); + + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + n /= 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_CLAMP: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline; + + float min; + float max; + memcpy(&min, ((const int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&max, ((const int32_t *) dst->op_params) + 1, sizeof(float)); + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&min length:sizeof(min) atIndex:2]; + [encoder setBytes:&max length:sizeof(max) atIndex:3]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + // we are not taking into account the strides, so for now require contiguous tensors + GGML_ASSERT(ggml_is_contiguous(src0)); + + case GGML_UNARY_OP_TANH: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_RELU: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline; + + [encoder setComputePipelineState:pipeline]; + 
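As with SCALE and CLAMP above, scalar parameters are carried in the int32-typed dst->op_params array and read back with memcpy rather than type punning, and the n % 4 check decides whether the float4 ("_4") pipeline variant is used. The following standalone C sketch illustrates only that packing/unpacking pattern; the combined layout of the three floats in one array is an example, not the layout used by any particular op in this patch.

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    int main(void) {
        int32_t op_params[4] = {0};

        // pack: store float bit patterns into the int32 slots
        const float scale = 0.125f, vmin = -1.0f, vmax = 1.0f;
        memcpy(&op_params[0], &scale, sizeof(scale));
        memcpy(&op_params[1], &vmin,  sizeof(vmin));
        memcpy(&op_params[2], &vmax,  sizeof(vmax));

        // unpack: same memcpy pattern as the encoder code
        float s, lo, hi;
        memcpy(&s,  &op_params[0], sizeof(s));
        memcpy(&lo, &op_params[1], sizeof(lo));
        memcpy(&hi, &op_params[2], sizeof(hi));

        // vectorization choice: use the *_4 pipeline when the element count is a multiple of 4
        int64_t n = 1000;
        const int use_vec4 = (n % 4 == 0);
        if (use_vec4) {
            n /= 4; // each thread then handles a float4
        }
        printf("scale=%g clamp=[%g,%g] vec4=%d n=%lld\n", s, lo, hi, use_vec4, (long long) n);
        return 0;
    }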
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_SIGMOID: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_GELU: + { + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_SILU: + { + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; default: - { - } break; - } - - if (!ggml_metal_supports_op(ctx, dst)) { - GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ABORT("unsupported op"); - } - - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; - } - - const int64_t ne00 = src0 ? src0->ne[0] : 0; - const int64_t ne01 = src0 ? src0->ne[1] : 0; - const int64_t ne02 = src0 ? src0->ne[2] : 0; - const int64_t ne03 = src0 ? src0->ne[3] : 0; - - const uint64_t nb00 = src0 ? src0->nb[0] : 0; - const uint64_t nb01 = src0 ? src0->nb[1] : 0; - const uint64_t nb02 = src0 ? src0->nb[2] : 0; - const uint64_t nb03 = src0 ? src0->nb[3] : 0; - - const int64_t ne10 = src1 ? src1->ne[0] : 0; - const int64_t ne11 = src1 ? src1->ne[1] : 0; - const int64_t ne12 = src1 ? src1->ne[2] : 0; - const int64_t ne13 = src1 ? src1->ne[3] : 0; - - const uint64_t nb10 = src1 ? src1->nb[0] : 0; - const uint64_t nb11 = src1 ? src1->nb[1] : 0; - const uint64_t nb12 = src1 ? src1->nb[2] : 0; - const uint64_t nb13 = src1 ? src1->nb[3] : 0; - - const int64_t ne20 = src2 ? src2->ne[0] : 0; - const int64_t ne21 = src2 ? src2->ne[1] : 0; - const int64_t ne22 = src2 ? 
src2->ne[2] : 0; GGML_UNUSED(ne22); - const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); - - const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); - const uint64_t nb21 = src2 ? src2->nb[1] : 0; - const uint64_t nb22 = src2 ? src2->nb[2] : 0; - const uint64_t nb23 = src2 ? src2->nb[3] : 0; - - const int64_t ne0 = dst ? dst->ne[0] : 0; - const int64_t ne1 = dst ? dst->ne[1] : 0; - const int64_t ne2 = dst ? dst->ne[2] : 0; - const int64_t ne3 = dst ? dst->ne[3] : 0; - - const uint64_t nb0 = dst ? dst->nb[0] : 0; - const uint64_t nb1 = dst ? dst->nb[1] : 0; - const uint64_t nb2 = dst ? dst->nb[2] : 0; - const uint64_t nb3 = dst ? dst->nb[3] : 0; - - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; - const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - - id id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil; - id id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil; - id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; - id id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil; - - //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); - //if (src0) { - // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, - // ggml_is_contiguous(src0), src0->name); - //} - //if (src1) { - // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, - // ggml_is_contiguous(src1), src1->name); - //} - //if (dst) { - // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, - // dst->name); - //} - - switch (dst->op) { - case GGML_OP_CONCAT: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; - - const int32_t dim = ((int32_t *) dst->op_params)[0]; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; - [encoder setBytes:&nb3 length:sizeof(nb3) 
atIndex:26]; - [encoder setBytes:&dim length:sizeof(dim) atIndex:27]; - - const int nth = MIN(1024, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ADD: - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - { - GGML_ASSERT(src0t == GGML_TYPE_F32); - GGML_ASSERT(src1t == GGML_TYPE_F32); - - const size_t offs = 0; - - bool bcast_row = false; - - int64_t nb = ne00; // used by the "row" kernels - - id pipeline = nil; - - if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { - GGML_ASSERT(ggml_is_contiguous(src0)); - - // src1 is a row - GGML_ASSERT(ne11 == 1); - - nb = ne00 / 4; - switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; - case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break; - case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; - case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; - default: GGML_ABORT("fatal error"); - } - - bcast_row = true; - } else { - switch (dst->op) { - case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; - case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break; - case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; - case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; - default: GGML_ABORT("fatal error"); - } - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; - [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:28]; - - if (bcast_row) { - const int64_t n = ggml_nelements(dst)/4; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } else { - const int nth = MIN((int) 
pipeline.maxTotalThreadsPerThreadgroup, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } - } break; - case GGML_OP_REPEAT: - { - id pipeline; - - switch (src0t) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; - case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; - case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; - default: GGML_ABORT("fatal error"); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ACC: - { - GGML_ASSERT(src0t == GGML_TYPE_F32); - GGML_ASSERT(src1t == GGML_TYPE_F32); - GGML_ASSERT(dstt == GGML_TYPE_F32); - - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - - const size_t pnb1 = ((int32_t *) dst->op_params)[0]; - const size_t pnb2 = ((int32_t *) dst->op_params)[1]; - const size_t pnb3 = ((int32_t *) dst->op_params)[2]; - const size_t offs = ((int32_t *) dst->op_params)[3]; - - const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - - if (!inplace) { - // run a separete kernel to cpy src->dst - // not sure how to avoid this - // TODO: make a simpler cpy_bytes kernel - - const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 
length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } - - const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; - [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8]; - [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9]; - [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; - [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24]; - [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25]; - [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26]; - [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); - - [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_SCALE: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - - float scale; - memcpy(&scale, dst->op_params, sizeof(scale)); - - int64_t n = ggml_nelements(dst); - - id pipeline = nil; - - if (n % 4 == 0) { - n /= 4; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_CLAMP: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline; - - float min; - float max; - memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float)); - memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float)); - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - 
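For orientation, the elementwise ops encoded this way (CLAMP here, and the unary ops that follow) launch one single-thread threadgroup per element of the contiguous destination, i.e. n = ggml_nelements(dst) threads in total. A scalar CPU reference for CLAMP looks like the sketch below; it is illustrative only and not code from this patch.

    #include <stdio.h>

    // dst[i] = clamp(src[i], lo, hi) for all n elements of a contiguous buffer
    static void clamp_f32_ref(float *dst, const float *src, long n, float lo, float hi) {
        for (long i = 0; i < n; ++i) {
            const float x = src[i];
            dst[i] = x < lo ? lo : (x > hi ? hi : x);
        }
    }

    int main(void) {
        float v[4] = {-2.0f, -0.5f, 0.5f, 2.0f};
        clamp_f32_ref(v, v, 4, -1.0f, 1.0f);
        printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);
        return 0;
    }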
[encoder setBytes:&min length:sizeof(min) atIndex:2]; - [encoder setBytes:&max length:sizeof(max) atIndex:3]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(gf->nodes[i])) { - // we are not taking into account the strides, so for now require contiguous tensors - GGML_ASSERT(ggml_is_contiguous(src0)); - - case GGML_UNARY_OP_TANH: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_RELU: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_SIGMOID: - { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_GELU: - { - int64_t n = ggml_nelements(dst); - - id pipeline = nil; - - if (n % 4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline; - n /= 4; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_GELU_QUICK: - { - int64_t n = ggml_nelements(dst); - - id pipeline = nil; - - if (n % 4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline; - n /= 4; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_UNARY_OP_SILU: - { - int64_t n = ggml_nelements(dst); - - id pipeline = nil; - - if (n % 4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline; - n /= 4; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - default: - { - GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ABORT("fatal error"); - } - } break; - case GGML_OP_SQR: - { - 
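The unary activations dispatched above (GELU, GELU_QUICK, SILU) only differ in which pipeline is bound; their math lives in the Metal kernels. For reference, scalar C versions are sketched below using the standard tanh-based GELU approximation and the conventional 1.702 constant for GELU_QUICK; these formulas are the commonly used ones and are not copied from this patch.

    #include <math.h>
    #include <stdio.h>

    static float gelu_ref(float x) {
        // 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        const float c = 0.7978845608f; // sqrt(2/pi)
        return 0.5f*x*(1.0f + tanhf(c*(x + 0.044715f*x*x*x)));
    }

    static float gelu_quick_ref(float x) {
        return x/(1.0f + expf(-1.702f*x)); // x * sigmoid(1.702 x)
    }

    static float silu_ref(float x) {
        return x/(1.0f + expf(-x));        // x * sigmoid(x)
    }

    int main(void) {
        const float x = 1.0f;
        printf("gelu(1)=%f gelu_quick(1)=%f silu(1)=%f\n",
               gelu_ref(x), gelu_quick_ref(x), silu_ref(x));
        return 0;
    }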
GGML_ASSERT(ggml_is_contiguous(src0)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SQRT: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SIN: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_COS: - { - GGML_ASSERT(ggml_is_contiguous(src0)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SUM_ROWS: - { - GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) 
threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SOFT_MAX: - { - GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); - - int nth = 32; // SIMD width - - id pipeline = nil; - - const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); - - if (ne00%4 == 0) { - while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) { - nth *= 2; - } - if (use_f16) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline; - } - } else { - while (nth < ne00 && nth*ne01*ne02*ne03 < 256) { - nth *= 2; - } - if (use_f16) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32].pipeline; - } - } - - float scale; - float max_bias; - - memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale)); - memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias)); - - const int64_t nrows_x = ggml_nrows(src0); - const int64_t nrows_y = src0->ne[1]; - - const uint32_t n_head = nrows_x/nrows_y; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; - [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7]; - [encoder setBytes:&m0 length:sizeof(m0) atIndex:8]; - [encoder setBytes:&m1 length:sizeof(m1) atIndex:9]; - [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_DIAG_MASK_INF: - { - const int n_past = ((int32_t *)(dst->op_params))[0]; - - id pipeline = nil; - - if (ne00%8 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; - - if (ne00%8 == 0) { - [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } - else { - [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } - } break; - case GGML_OP_SSM_CONV: - { - GGML_ASSERT(src0t == GGML_TYPE_F32); - GGML_ASSERT(src1t == GGML_TYPE_F32); - - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder 
setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:15]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:16]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:17]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:18]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SSM_SCAN: - { - struct ggml_tensor * src3 = gf->nodes[i]->src[3]; - struct ggml_tensor * src4 = gf->nodes[i]->src[4]; - struct ggml_tensor * src5 = gf->nodes[i]->src[5]; - - GGML_ASSERT(src3); - GGML_ASSERT(src4); - GGML_ASSERT(src5); - - size_t offs_src3 = 0; - size_t offs_src4 = 0; - size_t offs_src5 = 0; - - id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; - id id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil; - id id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil; - - const int64_t ne30 = src3->ne[0]; GGML_UNUSED(ne30); - const int64_t ne31 = src3->ne[1]; GGML_UNUSED(ne31); - - const uint64_t nb30 = src3->nb[0]; - const uint64_t nb31 = src3->nb[1]; - - const int64_t ne40 = src4->ne[0]; GGML_UNUSED(ne40); - const int64_t ne41 = src4->ne[1]; GGML_UNUSED(ne41); - const int64_t ne42 = src4->ne[2]; GGML_UNUSED(ne42); - - const uint64_t nb40 = src4->nb[0]; - const uint64_t nb41 = src4->nb[1]; - const uint64_t nb42 = src4->nb[2]; - - const int64_t ne50 = src5->ne[0]; GGML_UNUSED(ne50); - const int64_t ne51 = src5->ne[1]; GGML_UNUSED(ne51); - const int64_t ne52 = src5->ne[2]; GGML_UNUSED(ne52); - - const uint64_t nb50 = src5->nb[0]; - const uint64_t nb51 = src5->nb[1]; - const uint64_t nb52 = src5->nb[2]; - - const int64_t d_state = ne00; - const int64_t d_inner = ne01; - const int64_t n_seq_tokens = ne11; - const int64_t n_seqs = ne02; - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; - [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; - [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4]; - [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:6]; - - [encoder setBytes:&d_state length:sizeof(d_state) atIndex:7]; - [encoder setBytes:&d_inner length:sizeof(d_inner) atIndex:8]; - [encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9]; - [encoder setBytes:&n_seqs length:sizeof(n_seqs) atIndex:10]; - - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12]; - [encoder 
setBytes:&nb02 length:sizeof(nb02) atIndex:13]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; - [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; - [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19]; - [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20]; - [encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21]; - [encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22]; - [encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23]; - [encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24]; - [encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25]; - [encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26]; - [encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27]; - [encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28]; - - [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_MUL_MAT: - { - GGML_ASSERT(ne00 == ne10); - - GGML_ASSERT(ne12 % ne02 == 0); - GGML_ASSERT(ne13 % ne03 == 0); - - const uint r2 = ne12/ne02; - const uint r3 = ne13/ne03; - - // find the break-even point where the matrix-matrix kernel becomes more efficient compared - // to the matrix-vector kernel - int ne11_mm_min = 1; + { + GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); + GGML_ABORT("fatal error"); + } + } break; + case GGML_OP_SQR: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SQRT: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SIN: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_COS: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SUM_ROWS: + { + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + + [encoder 
setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:19]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:20]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:21]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:22]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:23]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:24]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:25]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SOFT_MAX: + { + GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); + + int nth = 32; // SIMD width + + id pipeline = nil; + + const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); + + if (ne00%4 == 0) { + while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) { + nth *= 2; + } + if (use_f16) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline; + } + } else { + while (nth < ne00 && nth*ne01*ne02*ne03 < 256) { + nth *= 2; + } + if (use_f16) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32].pipeline; + } + } + + float scale; + float max_bias; + + memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(scale)); + memcpy(&max_bias, ((const int32_t *) dst->op_params) + 1, sizeof(max_bias)); + + const int64_t nrows_x = ggml_nrows(src0); + const int64_t nrows_y = src0->ne[1]; + + const uint32_t n_head = nrows_x/nrows_y; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + if (id_src1) { + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; + [encoder setBytes:&max_bias length:sizeof(max_bias) 
atIndex:7]; + [encoder setBytes:&m0 length:sizeof(m0) atIndex:8]; + [encoder setBytes:&m1 length:sizeof(m1) atIndex:9]; + [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_DIAG_MASK_INF: + { + const int n_past = ((const int32_t *)(dst->op_params))[0]; + + id pipeline = nil; + + if (ne00%8 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; + + if (ne00%8 == 0) { + [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + else { + [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + } break; + case GGML_OP_SSM_CONV: + { + GGML_ASSERT(src0t == GGML_TYPE_F32); + GGML_ASSERT(src1t == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:15]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:16]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:17]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:18]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SSM_SCAN: + { + struct ggml_tensor * src3 = node->src[3]; + struct ggml_tensor * src4 = node->src[4]; + struct ggml_tensor * src5 = node->src[5]; + + GGML_ASSERT(src3); + GGML_ASSERT(src4); + GGML_ASSERT(src5); + + size_t offs_src3 = 0; + size_t offs_src4 = 0; + size_t offs_src5 = 0; + + id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; + id id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil; + id id_src5 = src5 ? 
ggml_metal_get_buffer(src5, &offs_src5) : nil; + + const int64_t ne30 = src3->ne[0]; GGML_UNUSED(ne30); + const int64_t ne31 = src3->ne[1]; GGML_UNUSED(ne31); + + const uint64_t nb30 = src3->nb[0]; + const uint64_t nb31 = src3->nb[1]; + + const int64_t ne40 = src4->ne[0]; GGML_UNUSED(ne40); + const int64_t ne41 = src4->ne[1]; GGML_UNUSED(ne41); + const int64_t ne42 = src4->ne[2]; GGML_UNUSED(ne42); + + const uint64_t nb40 = src4->nb[0]; + const uint64_t nb41 = src4->nb[1]; + const uint64_t nb42 = src4->nb[2]; + + const int64_t ne50 = src5->ne[0]; GGML_UNUSED(ne50); + const int64_t ne51 = src5->ne[1]; GGML_UNUSED(ne51); + const int64_t ne52 = src5->ne[2]; GGML_UNUSED(ne52); + + const uint64_t nb50 = src5->nb[0]; + const uint64_t nb51 = src5->nb[1]; + const uint64_t nb52 = src5->nb[2]; + + const int64_t d_state = ne00; + const int64_t d_inner = ne01; + const int64_t n_seq_tokens = ne11; + const int64_t n_seqs = ne02; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; + [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4]; + [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:6]; + + [encoder setBytes:&d_state length:sizeof(d_state) atIndex:7]; + [encoder setBytes:&d_inner length:sizeof(d_inner) atIndex:8]; + [encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9]; + [encoder setBytes:&n_seqs length:sizeof(n_seqs) atIndex:10]; + + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17]; + [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19]; + [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20]; + [encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21]; + [encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22]; + [encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23]; + [encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24]; + [encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25]; + [encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26]; + [encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27]; + [encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28]; + + [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_MUL_MAT: + { + GGML_ASSERT(ne00 == ne10); + + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + const uint r2 = ne12/ne02; + const uint r3 = ne13/ne03; + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = 1; #if 0 - // the numbers below are measured on M2 Ultra for 7B and 13B models - // these numbers do not translate to other devices or model sizes - // TODO: need to find a better approach + // the numbers below are measured on M2 Ultra for 7B and 13B models + // these numbers do not translate to other 
devices or model sizes + // TODO: need to find a better approach if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { switch (src0t) { case GGML_TYPE_F16: ne11_mm_min = 2; break; @@ -1763,11 +1745,11 @@ static enum ggml_status ggml_metal_graph_compute( // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && - !ggml_is_transposed(src0) && - !ggml_is_transposed(src1) && - src1t == GGML_TYPE_F32 && - ne00 % 32 == 0 && ne00 >= 64 && - (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { + !ggml_is_transposed(src0) && + !ggml_is_transposed(src1) && + src1t == GGML_TYPE_F32 && + ne00 % 32 == 0 && ne00 >= 64 && + (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); // some Metal matrix data types require aligned pointers @@ -2001,8 +1983,8 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || - src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { @@ -2036,1041 +2018,1158 @@ static enum ggml_status ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } } - } break; - case GGML_OP_MUL_MAT_ID: - { - const int n_as = src0->ne[2]; - - // src2 = ids - const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t); - - GGML_ASSERT(src2t == GGML_TYPE_I32); - - GGML_ASSERT(!ggml_is_transposed(src0)); - GGML_ASSERT(!ggml_is_transposed(src1)); - - GGML_ASSERT(src1t == GGML_TYPE_F32); - - // find the break-even point where the matrix-matrix kernel becomes more efficient compared - // to the matrix-vector kernel - // ne20 = n_used_experts - // ne21 = n_rows - const int dst_rows = ne20*ne21; - const int dst_rows_min = n_as; - const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4; - - // max size of the rowids array in the kernel shared buffer - GGML_ASSERT(dst_rows <= dst_rows_max); - - // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs - // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - // !!! - // TODO: for now, always use mat-vec kernels until we figure out how to improve the - // indirect matrix multiplication - // !!! 
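The MUL_MAT_ID path below bounds dst_rows by the threadgroup-memory budget: the kernel reserves an 8192-byte tile plus 4 bytes (one ushort2) per destination row, padded to 16 bytes, so dst_rows_max is derived from the device's maxThreadgroupMemoryLength. A standalone C sketch of that bookkeeping follows; the device limit and the example shapes are made-up values, PAD is a local stand-in for the padding macro used in the diff, and the real path additionally checks the GPU family and the ne00 alignment.

    #include <stdint.h>
    #include <stdio.h>

    #define PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main(void) {
        const size_t  max_tg_mem = 32768;      // hypothetical device limit, in bytes
        const int64_t ne20 = 2, ne21 = 512;    // n_used_experts, n_rows (example shapes)
        const int     n_as = 8;                // number of experts

        const int dst_rows     = (int)(ne20*ne21);
        const int dst_rows_min = n_as;
        const int dst_rows_max = (int)((max_tg_mem - 32 - 8192)/4);

        // mat-mat indirect kernel is only used (and only fits) inside this window;
        // the encoder also requires an Apple7+ GPU family and ne00 % 32 == 0, ne00 >= 64
        const int use_mm = dst_rows > dst_rows_min && dst_rows <= dst_rows_max;

        const size_t tg_mem = PAD(8192 + (size_t) dst_rows*4 /* sizeof(ushort2) */, 16);
        printf("dst_rows=%d (min %d, max %d) use_mm=%d tg_mem=%zu bytes\n",
               dst_rows, dst_rows_min, dst_rows_max, use_mm, tg_mem);
        return 0;
    }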
- if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && - ne00 % 32 == 0 && ne00 >= 64 && - dst_rows > dst_rows_min) { - - // some Metal matrix data types require aligned pointers - // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) - switch (src0->type) { - case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; - case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; - default: break; - } - - id pipeline = nil; - - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; - case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; - case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; - case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; - case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; - default: GGML_ABORT("MUL_MAT_ID not implemented"); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; - [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; - [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:8]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10]; - [encoder setBytes:&ne11 
length:sizeof(ne11) atIndex:11]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:18]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; - - [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; - } else { - int nth0 = 32; - int nth1 = 1; - int nrows = 1; - //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - - id pipeline = nil; - - // use custom matrix x vector kernel - switch (src0t) { - case GGML_TYPE_F32: - { - GGML_ASSERT(src1t == GGML_TYPE_F32); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline; - } break; - case GGML_TYPE_F16: - { - GGML_ASSERT(src1t == GGML_TYPE_F32); - nth0 = 32; - nth1 = 1; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; - } break; - case GGML_TYPE_Q4_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline; - } break; - case GGML_TYPE_Q4_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline; - } break; - case GGML_TYPE_Q5_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline; - } break; - case GGML_TYPE_Q5_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline; - } break; - case GGML_TYPE_Q8_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline; - } break; - case GGML_TYPE_Q2_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline; - } break; - case GGML_TYPE_Q3_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline; - } break; - case GGML_TYPE_Q4_K: - { - nth0 = 4; //1; - nth1 = 8; //32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline; - } break; - case GGML_TYPE_Q5_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline; - } break; - case GGML_TYPE_Q6_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; - } break; - case GGML_TYPE_IQ3_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ3_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline; - } break; - case GGML_TYPE_IQ2_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline; - } break; - case GGML_TYPE_IQ1_S: - { - nth0 = 4; - nth1 = 16; - pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; - } break; - case GGML_TYPE_IQ1_M: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline; - } break; - case GGML_TYPE_IQ4_NL: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; - } break; - case GGML_TYPE_IQ4_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline; - } break; - default: - { - GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); - GGML_ABORT("not implemented"); - } - }; - - if (ggml_is_quantized(src0t)) { - GGML_ASSERT(ne00 >= nth0*nth1); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; - [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; - [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:20]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:21]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:22]; - - const int64_t _ne1 = 1; - const int tgz = dst_rows; - - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || - src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { - const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { - const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 
256*4+128 : 512*4; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { - const int mem_size = 32*sizeof(float); - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q3_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } else { - const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1 - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - } - } break; - case GGML_OP_GET_ROWS: - { - id pipeline = nil; - - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; - case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break; - case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break; - case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break; - case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; - case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M ].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; - case GGML_TYPE_IQ4_XS: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; - case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; - default: GGML_ABORT("not implemented"); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5]; - [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7]; - [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; - } break; - case GGML_OP_RMS_NORM: - { - GGML_ASSERT(ne00 % 4 == 0); - GGML_ASSERT(ggml_is_contiguous_1(src0)); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - int nth = 32; // SIMD width - - while (nth < ne00/4 && nth < 1024) { - nth *= 2; - } - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - - const int64_t nrows = ggml_nrows(src0); - - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_GROUP_NORM: - { - GGML_ASSERT(ne00 % 4 == 0); - GGML_ASSERT(ggml_is_contiguous(src0)); - - float eps; - memcpy(&eps, dst->op_params + 1, sizeof(float)); - - const int32_t n_groups = ((int32_t *) dst->op_params)[0]; - - int nth = 32; // SIMD width - - //while (nth < ne00/4 && nth < 1024) { - // nth *= 2; - //} - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8]; - [encoder setBytes:&eps length:sizeof( float) atIndex:9]; - [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_NORM: - { - GGML_ASSERT(ggml_is_contiguous_1(src0)); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - const int nth = MIN(256, ne00); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder 
setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0]; - - const int64_t nrows = ggml_nrows(src0); - - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ROPE: - { - GGML_ASSERT(ne10 == ne02); - - const int nth = MIN(1024, ne00); - - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; - - float freq_base; - float freq_scale; - float ext_factor; - float attn_factor; - float beta_fast; - float beta_slow; - - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - - id pipeline = nil; - - if (!is_neox) { - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; - default: GGML_ABORT("fatal error"); - }; - } else { - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; - default: GGML_ABORT("fatal error"); - }; - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - if (id_src2 != nil) { - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; - } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; - } - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:20]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:21]; - [encoder 
setBytes:&n_ctx_orig length:sizeof( int) atIndex:22]; - [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; - [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; - [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; - [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; - [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; - [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_IM2COL: - { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); - - const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; - - const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; - - const int32_t N = src1->ne[is_2D ? 3 : 2]; - const int32_t IC = src1->ne[is_2D ? 2 : 1]; - const int32_t IH = is_2D ? src1->ne[1] : 1; - const int32_t IW = src1->ne[0]; - - const int32_t KH = is_2D ? src0->ne[1] : 1; - const int32_t KW = src0->ne[0]; - - const int32_t OH = is_2D ? dst->ne[2] : 1; - const int32_t OW = dst->ne[1]; - - const int32_t CHW = IC * KH * KW; - - const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; - const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; - - id pipeline = nil; - - switch (dst->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; - default: GGML_ABORT("fatal error"); - }; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; - [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3]; - [encoder setBytes:&IW length:sizeof( int32_t) atIndex:4]; - [encoder setBytes:&IH length:sizeof( int32_t) atIndex:5]; - [encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6]; - [encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7]; - [encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8]; - [encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9]; - [encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10]; - [encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11]; - [encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12]; - - [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; - } break; - case GGML_OP_UPSCALE: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - const float sf0 = (float)ne0/src0->ne[0]; - const float sf1 = (float)ne1/src0->ne[1]; - const float sf2 = (float)ne2/src0->ne[2]; - const float sf3 = (float)ne3/src0->ne[3]; - - const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 
length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - [encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18]; - [encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19]; - [encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20]; - [encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21]; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_PAD: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - - const int nth = MIN(1024, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ARANGE: - { - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - float start; - float step; - - memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float)); - memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:0]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1]; - [encoder setBytes:&start length:sizeof(start) atIndex:2]; - [encoder setBytes:&step length:sizeof(step) atIndex:3]; - - const int nth = MIN(1024, ne0); - - [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_TIMESTEP_EMBEDDING: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - const int dim = dst->op_params[0]; - const int max_period = dst->op_params[1]; - - const int half = dim / 2; - - id pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2]; - [encoder setBytes:&dim length:sizeof(dim) atIndex:3]; - [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4]; - - const int nth = MIN(1024, half); - - [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ARGSORT: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); - - const int nrows = ggml_nrows(src0); - - enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; - - // bitonic sort requires the number of elements to be power of 2 - int64_t ne00_padded = 1; - while (ne00_padded < ne00) { - ne00_padded *= 2; - } - - // Metal kernels require the buffer size to be multiple of 16 bytes - // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength - const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16); - - id pipeline = nil; - - switch (order) { - case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; - case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; - default: GGML_ABORT("fatal error"); - }; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3]; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)]; - } break; - case GGML_OP_LEAKY_RELU: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - float slope; - memcpy(&slope, dst->op_params, sizeof(float)); - - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&slope length:sizeof(slope) atIndex:2]; - - const int64_t n = ggml_nelements(dst); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_FLASH_ATTN_EXT: - { - GGML_ASSERT(ne00 % 4 == 0); - GGML_ASSERT(ne11 % 32 == 0); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - GGML_ASSERT(ggml_are_same_shape (src1, src2)); - - struct ggml_tensor * src3 = gf->nodes[i]->src[3]; - - size_t offs_src3 = 0; - - id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; - - GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16); - GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) && - "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big"); - - const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); - //const int64_t ne31 = src3 ? src3->ne[1] : 0; - const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32); - const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33); - - const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30); - const uint64_t nb31 = src3 ? src3->nb[1] : 0; - const uint64_t nb32 = src3 ? 
src3->nb[2] : 0; GGML_UNUSED(nb32); - const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33); - - const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); - - float scale; - float max_bias; - float logit_softcap; - memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale)); - memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias)); - memcpy(&logit_softcap, ((int32_t *) dst->op_params) + 2, sizeof(logit_softcap)); - - if (logit_softcap != 0.0f) { - scale /= logit_softcap; - } - - const uint32_t n_head = src0->ne[2]; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - id pipeline = nil; - - bool use_vec_kernel = false; - - if (ne01 >= 4 || (ne00%128 != 0)) { - switch (ne00) { - case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; - case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; - case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; - case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; - case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; - //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; - default: - { - GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); - GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ABORT("add template specialization for this size"); - } - } - } else { - use_vec_kernel = true; - - switch (ne00) { - case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; - //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; - default: - { - GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); - GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ABORT("add template specialization for this size"); - } - } - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; - if (id_src3) { - [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; - } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3]; - } - [encoder setBuffer:id_dst offset:offs_dst atIndex:4]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&nb23 length:sizeof(uint64_t) 
atIndex:19]; - [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22]; - [encoder setBytes:&scale length:sizeof( float) atIndex:23]; - [encoder setBytes:&max_bias length:sizeof( float) atIndex:24]; - [encoder setBytes:&m0 length:sizeof(m0) atIndex:25]; - [encoder setBytes:&m1 length:sizeof(m1) atIndex:26]; - [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27]; - [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28]; - - if (!use_vec_kernel) { - // half8x8 kernel - const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !! - const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! - - GGML_ASSERT(nqptg <= 32); - GGML_ASSERT(nqptg % 8 == 0); - GGML_ASSERT(ncpsg % 32 == 0); - - int64_t nsgmax = 2; - - while (true) { - const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2); - if (smem > ctx->device.maxThreadgroupMemoryLength) { - break; - } - nsgmax *= 2; - } - nsgmax /= 2; - - // simdgroups per threadgroup (a.k.a. warps) - const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4; - - const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2); - - //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); - GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); - - [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; - } else { - // half1x4 kernel - const int64_t nqptg = 1; // queries per threadgroup !! sync with kernel template arguments !! - const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! - - GGML_ASSERT(nqptg <= 32); - GGML_ASSERT(nqptg % 1 == 0); - GGML_ASSERT(ncpsg % 32 == 0); - - // simdgroups per threadgroup (a.k.a. 
warps) - const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)); - - int64_t nsg = 1; - while (nsg <= nsgt) { - nsg *= 2; - } - nsg /= 2; - - const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2); - - //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); - GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); - [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; - } - } break; - case GGML_OP_DUP: - case GGML_OP_CPY: - case GGML_OP_CONT: - { - GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); - - int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); - - id pipeline = nil; - - switch (src0t) { - case GGML_TYPE_F32: - { - GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0); - - switch (dstt) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break; - default: GGML_ABORT("not implemented"); - }; - } break; - case GGML_TYPE_F16: - { - switch (dstt) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; - default: GGML_ABORT("not implemented"); - }; - } break; - default: GGML_ABORT("not implemented"); - } - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - default: - { - GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, 
ggml_op_name(dst->op)); - GGML_ABORT("fatal error"); + } break; + case GGML_OP_MUL_MAT_ID: + { + const int n_as = src0->ne[2]; + + // src2 = ids + const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t); + + GGML_ASSERT(src2t == GGML_TYPE_I32); + + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + + GGML_ASSERT(src1t == GGML_TYPE_F32); + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + // ne20 = n_used_experts + // ne21 = n_rows + const int dst_rows = ne20*ne21; + const int dst_rows_min = n_as; + const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4; + + // max size of the rowids array in the kernel shared buffer + GGML_ASSERT(dst_rows <= dst_rows_max); + + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + // !!! + // TODO: for now, always use mat-vec kernels until we figure out how to improve the + // indirect matrix multiplication + // !!! + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne00 % 32 == 0 && ne00 >= 64 && + dst_rows > dst_rows_min) { + + // some Metal matrix data types require aligned pointers + // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) + switch (src0->type) { + case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; + case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + default: break; } - } - if (should_capture) { - [encoder popDebugGroup]; + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; + case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; + case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; + default: GGML_ABORT("MUL_MAT_ID not implemented"); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; + [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:8]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:18]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; + + [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } else { + int nth0 = 32; + int nth1 = 1; + int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + + id pipeline = nil; + + // use custom matrix x vector kernel + switch (src0t) { + case GGML_TYPE_F32: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + nth0 = 32; + nth1 = 1; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; + } break; + case GGML_TYPE_Q4_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline; + } break; + case GGML_TYPE_Q4_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline; + } break; + case GGML_TYPE_Q5_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline; + } break; + case GGML_TYPE_Q5_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline; + } break; + case GGML_TYPE_Q8_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline; + } break; + case GGML_TYPE_Q2_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline; + } break; + case GGML_TYPE_Q3_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline; + } break; + case GGML_TYPE_Q4_K: + { + nth0 = 4; //1; + nth1 = 8; //32; + 
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline; + } break; + case GGML_TYPE_Q5_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline; + } break; + case GGML_TYPE_Q6_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; + } break; + case GGML_TYPE_IQ3_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ3_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline; + } break; + case GGML_TYPE_IQ2_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline; + } break; + case GGML_TYPE_IQ1_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; + } break; + case GGML_TYPE_IQ1_M: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline; + } break; + case GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; + } break; + case GGML_TYPE_IQ4_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline; + } break; + default: + { + GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); + GGML_ABORT("not implemented"); + } + }; + + if (ggml_is_quantized(src0t)) { + GGML_ASSERT(ne00 >= nth0*nth1); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; + [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:20]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:21]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:22]; + + const int64_t _ne1 = 1; + const int tgz = dst_rows; + + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t 
== GGML_TYPE_IQ2_S) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { + const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { + const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q3_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1 + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } + } break; + case GGML_OP_GET_ROWS: + { + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; 
break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break; + case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; + case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; + case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; + default: GGML_ABORT("not implemented"); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + } break; + case GGML_OP_RMS_NORM: + { + GGML_ASSERT(ne00 % 4 == 0); + GGML_ASSERT(ggml_is_contiguous_1(src0)); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + int nth = 32; // SIMD width + + while (nth < ne00/4 && nth < 1024) { + nth *= 2; + } + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_GROUP_NORM: + { + GGML_ASSERT(ne00 % 4 == 0); + GGML_ASSERT(ggml_is_contiguous(src0)); + + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + + const int32_t n_groups = ((const int32_t *) dst->op_params)[0]; + + int nth = 32; // SIMD width + + //while (nth < ne00/4 && nth < 1024) { + // nth *= 2; + //} + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8]; + [encoder setBytes:&eps 
length:sizeof( float) atIndex:9]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_NORM: + { + GGML_ASSERT(ggml_is_contiguous_1(src0)); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + const int nth = MIN(256, ne00); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ROPE: + { + GGML_ASSERT(ne10 == ne02); + + const int nth = MIN(1024, ne00); + + const int n_past = ((const int32_t *) dst->op_params)[0]; + const int n_dims = ((const int32_t *) dst->op_params)[1]; + const int mode = ((const int32_t *) dst->op_params)[2]; + // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal + const int n_ctx_orig = ((const int32_t *) dst->op_params)[4]; + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + + memcpy(&freq_base, (const int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (const int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (const int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (const int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (const int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (const int32_t *) dst->op_params + 10, sizeof(float)); + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + + id pipeline = nil; + + if (!is_neox) { + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + } else { + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + if (id_src2 != nil) { + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11]; + [encoder 
setBytes:&ne0 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:20]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:21]; + [encoder setBytes:&n_ctx_orig length:sizeof( int) atIndex:22]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_IM2COL: + { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int32_t N = src1->ne[is_2D ? 3 : 2]; + const int32_t IC = src1->ne[is_2D ? 2 : 1]; + const int32_t IH = is_2D ? src1->ne[1] : 1; + const int32_t IW = src1->ne[0]; + + const int32_t KH = is_2D ? src0->ne[1] : 1; + const int32_t KW = src0->ne[0]; + + const int32_t OH = is_2D ? dst->ne[2] : 1; + const int32_t OW = dst->ne[1]; + + const int32_t CHW = IC * KH * KW; + + const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; + const int32_t ofs1 = src1->nb[is_2D ? 
2 : 1] / 4; + + id pipeline = nil; + + switch (dst->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; + [encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3]; + [encoder setBytes:&IW length:sizeof( int32_t) atIndex:4]; + [encoder setBytes:&IH length:sizeof( int32_t) atIndex:5]; + [encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6]; + [encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7]; + [encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8]; + [encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9]; + [encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10]; + [encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11]; + [encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12]; + + [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; + } break; + case GGML_OP_UPSCALE: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const float sf0 = (float)ne0/src0->ne[0]; + const float sf1 = (float)ne1/src0->ne[1]; + const float sf2 = (float)ne2/src0->ne[2]; + const float sf3 = (float)ne3/src0->ne[3]; + + const id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + [encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18]; + [encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19]; + [encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20]; + [encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_PAD: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) 
atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ARANGE: + { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float start; + float step; + + memcpy(&start, ((const int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&step, ((const int32_t *) dst->op_params) + 2, sizeof(float)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:0]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1]; + [encoder setBytes:&start length:sizeof(start) atIndex:2]; + [encoder setBytes:&step length:sizeof(step) atIndex:3]; + + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + const int half = dim / 2; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2]; + [encoder setBytes:&dim length:sizeof(dim) atIndex:3]; + [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4]; + + const int nth = MIN(1024, half); + + [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ARGSORT: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + + const int nrows = ggml_nrows(src0); + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + + // bitonic sort requires the number of elements to be power of 2 + int64_t ne00_padded = 1; + while (ne00_padded < ne00) { + ne00_padded *= 2; + } + + // Metal kernels require the buffer size to be multiple of 16 bytes + // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength + const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16); + + id pipeline = nil; + + switch (order) { + case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; + case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; 
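// --- illustrative aside, not part of the diff above or below ----------------
// The ARGSORT encode step sizes its threadgroup memory in two stages:
//   1. round ne00 up to the next power of two (the bitonic sort kernel
//      requires a power-of-two element count),
//   2. pad the resulting byte count to a multiple of 16, as required by
//      setThreadgroupMemoryLength.
// Minimal sketch in plain C: GGML_PAD is copied from ggml.h, the helper name
// and the example value ne00 = 100 are purely illustrative.

#include <stdint.h>
#include <stddef.h>

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

size_t argsort_shared_mem_size(int64_t ne00) {
    int64_t ne00_padded = 1;
    while (ne00_padded < ne00) {
        ne00_padded *= 2;                                // e.g. 100 -> 128
    }
    return GGML_PAD(ne00_padded * sizeof(int32_t), 16);  // 128 * 4 = 512 bytes
}

// argsort_shared_mem_size(100) == 512
// ----------------------------------------------------------------------------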
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3]; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)]; + } break; + case GGML_OP_LEAKY_RELU: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + float slope; + memcpy(&slope, dst->op_params, sizeof(float)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&slope length:sizeof(slope) atIndex:2]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_FLASH_ATTN_EXT: + { + GGML_ASSERT(ne00 % 4 == 0); + GGML_ASSERT(ne11 % 32 == 0); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_are_same_shape (src1, src2)); + + struct ggml_tensor * src3 = node->src[3]; + + size_t offs_src3 = 0; + + id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; + + GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16); + GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) && + "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big"); + + const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); + //const int64_t ne31 = src3 ? src3->ne[1] : 0; + const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32); + const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33); + + const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30); + const uint64_t nb31 = src3 ? src3->nb[1] : 0; + const uint64_t nb32 = src3 ? src3->nb[2] : 0; GGML_UNUSED(nb32); + const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33); + + const enum ggml_type src2t = src2 ? 
src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); + + float scale; + float max_bias; + float logit_softcap; + memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(scale)); + memcpy(&max_bias, ((const int32_t *) dst->op_params) + 1, sizeof(max_bias)); + memcpy(&logit_softcap, ((const int32_t *) dst->op_params) + 2, sizeof(logit_softcap)); + + if (logit_softcap != 0.0f) { + scale /= logit_softcap; + } + + const uint32_t n_head = src0->ne[2]; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + id pipeline = nil; + + bool use_vec_kernel = false; + + if (ne01 >= 4 || (ne00%128 != 0)) { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; + //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; + default: + { + GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_METAL_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } else { + use_vec_kernel = true; + + switch (ne00) { + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; + //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + default: + { + GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_METAL_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + if (id_src3) { + [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:4]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19]; + [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20]; + [encoder setBytes:&ne1 length:sizeof( int64_t) 
atIndex:21]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22]; + [encoder setBytes:&scale length:sizeof( float) atIndex:23]; + [encoder setBytes:&max_bias length:sizeof( float) atIndex:24]; + [encoder setBytes:&m0 length:sizeof(m0) atIndex:25]; + [encoder setBytes:&m1 length:sizeof(m1) atIndex:26]; + [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27]; + [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28]; + + if (!use_vec_kernel) { + // half8x8 kernel + const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !! + const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! + + GGML_ASSERT(nqptg <= 32); + GGML_ASSERT(nqptg % 8 == 0); + GGML_ASSERT(ncpsg % 32 == 0); + + int64_t nsgmax = 2; + + while (true) { + const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2); + if (smem > ctx->device.maxThreadgroupMemoryLength) { + break; + } + nsgmax *= 2; + } + nsgmax /= 2; + + // simdgroups per threadgroup (a.k.a. warps) + const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4; + + const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2); + + //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); + GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); + + [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; + } else { + // half1x4 kernel + const int64_t nqptg = 1; // queries per threadgroup !! sync with kernel template arguments !! + const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! + + GGML_ASSERT(nqptg <= 32); + GGML_ASSERT(nqptg % 1 == 0); + GGML_ASSERT(ncpsg % 32 == 0); + + // simdgroups per threadgroup (a.k.a. 
warps) + const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)); + + int64_t nsg = 1; + while (nsg <= nsgt) { + nsg *= 2; + } + nsg /= 2; + + const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2); + + //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); + GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); + [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; + } + } break; + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + { + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + + int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); + + id pipeline = nil; + + switch (src0t) { + case GGML_TYPE_F32: + { + GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0); + + switch (dstt) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break; + default: GGML_ABORT("not implemented"); + }; + } break; + case GGML_TYPE_F16: + { + switch (dstt) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; + default: GGML_ABORT("not implemented"); + }; + } break; + default: GGML_ABORT("not implemented"); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + default: + { + GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, 
ggml_op_name(dst->op)); + GGML_ABORT("fatal error"); + } + } +} + +static enum ggml_status ggml_metal_graph_compute( + struct ggml_backend_metal_context * ctx, + struct ggml_cgraph * gf) { + // number of nodes encoded by the main thread (empirically determined) + const int n_main = 128; + + // number of threads in addition to the main thread + const int n_cb = ctx->n_cb; + + // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them + // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread + // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes + // each thread creates it's own command buffer and enqueues the ops in parallel + // + // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2 + + @autoreleasepool { + ctx->gf = gf; + + ctx->n_nodes_0 = MIN(n_main, gf->n_nodes); + ctx->n_nodes_1 = gf->n_nodes - ctx->n_nodes_0; + + ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb; + + const bool should_capture = ctx->capture_next_compute; + if (should_capture) { + ctx->capture_next_compute = false; + + if (!ctx->capture_started) { + // create capture scope + ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device]; + + MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new]; + descriptor.captureObject = ctx->capture_scope; + descriptor.destination = MTLCaptureDestinationGPUTraceDocument; + descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]]; + + NSError * error = nil; + if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { + GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); + GGML_ABORT("capture failed"); + } else { + [ctx->capture_scope beginScope]; + ctx->capture_started = true; + } } } - [encoder endEncoding]; + // TODO: how to avoid this allocation? I tried initializing it in ggml_backend_metal_set_n_cb but it crashes. + ctx->encode_async = ^(size_t iter) { + const int cb_idx = iter; + const int n_cb_l = ctx->n_cb; - if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; - } - }); + const int n_nodes_0 = ctx->n_nodes_0; + const int n_nodes_1 = ctx->n_nodes_1; - // Wait for completion and check status of each command buffer - // needed to detect if the device ran out-of-memory for example (#1881) + const int n_nodes_per_cb = ctx->n_nodes_per_cb; - for (int i = 0; i < n_cb; ++i) { - id command_buffer = command_buffers[i]; - [command_buffer waitUntilCompleted]; + id command_buffer = ctx->command_buffers[cb_idx]; + id encoder = [command_buffer computeCommandEncoderWithDescriptor: ctx->edesc]; - MTLCommandBufferStatus status = [command_buffer status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - if (status == MTLCommandBufferStatusError) { - GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + int node_start = 0; + int node_end = n_nodes_0; + + if (cb_idx < n_cb_l) { + node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); + node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? 
n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); } - return GGML_STATUS_FAILED; + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(gf, idx)) encoding:NSUTF8StringEncoding]]; + } + + ggml_metal_encode_node(ctx, idx, encoder); + + if (should_capture) { + [encoder popDebugGroup]; + } + } + + [encoder endEncoding]; + + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer commit]; + } + }; + + // the main thread commits the first few commands immediately + // command_buffer[n_cb] + { + id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->command_buffers[n_cb] = command_buffer; + + [command_buffer enqueue]; + ctx->encode_async(n_cb); } - id next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil); - if (!next_buffer) { - continue; + // prepare the rest of the command buffers asynchronously + // command_buffer[0.. n_cb) + for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { + id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->command_buffers[cb_idx] = command_buffer; + + // always enqueue the first two command buffers + // enqueue all of the command buffers if we don't need to abort + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer enqueue]; + } } - bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); - if (next_queued) { - continue; + dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); + + // wait for completion and check status of each command buffer + // needed to detect if the device ran out-of-memory for example (#1881) + { + id command_buffer = ctx->command_buffers[n_cb]; + [command_buffer waitUntilCompleted]; + + MTLCommandBufferStatus status = [command_buffer status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); + if (status == MTLCommandBufferStatusError) { + GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } } - if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { - GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i); - return GGML_STATUS_ABORTED; + for (int i = 0; i < n_cb; ++i) { + id command_buffer = ctx->command_buffers[i]; + [command_buffer waitUntilCompleted]; + + MTLCommandBufferStatus status = [command_buffer status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + if (status == MTLCommandBufferStatusError) { + GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } + + id next_buffer = (i + 1 < n_cb ? 
ctx->command_buffers[i + 1] : nil); + if (!next_buffer) { + continue; + } + + const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } + + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } + + [next_buffer commit]; } - [next_buffer commit]; + if (!should_capture && ctx->capture_started) { + [ctx->capture_scope endScope]; + [[MTLCaptureManager sharedCaptureManager] stopCapture]; + } } - if (should_capture) { - [[MTLCaptureManager sharedCaptureManager] stopCapture]; - } - - } return GGML_STATUS_SUCCESS; } @@ -3405,6 +3504,25 @@ GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, g UNUSED(backend); } +static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { + GGML_ASSERT(ggml_backend_is_metal(backend)); + + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; + + if (ctx->n_cb != n_cb) { + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS); + + if (ctx->n_cb > 2) { + GGML_METAL_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb); + } + } + + // TODO: setting encode_async here causes crash during the next ggml_metal_graph_compute call. why? + //ctx->encode_async = ^(size_t iter) { + // ... + //}; +} + static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, @@ -3439,35 +3557,29 @@ static ggml_guid_t ggml_backend_metal_guid(void) { } ggml_backend_t ggml_backend_metal_init(void) { - struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); + struct ggml_backend_metal_context * ctx = ggml_metal_init(); if (ctx == NULL) { GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return NULL; } - ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); + ggml_backend_t backend = malloc(sizeof(struct ggml_backend)); - *metal_backend = (struct ggml_backend) { + *backend = (struct ggml_backend) { /* .guid = */ ggml_backend_metal_guid(), /* .interface = */ ggml_backend_metal_i, /* .context = */ ctx, }; - return metal_backend; + ggml_backend_metal_set_n_cb(backend, 1); + + return backend; } bool ggml_backend_is_metal(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_metal_guid()); } -void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { - GGML_ASSERT(ggml_backend_is_metal(backend)); - - struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; - - ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); -} - void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) { GGML_ASSERT(ggml_backend_is_metal(backend)); @@ -3489,7 +3601,7 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { GGML_ASSERT(ggml_backend_is_metal(backend)); struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; - ctx->should_capture_next_compute = true; + ctx->capture_next_compute = true; } GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning diff --git a/src/llama.cpp b/src/llama.cpp index d1d27d21e..4c0a1bb61 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17025,12 +17025,6 
@@ static void llama_graph_compute( ggml_cgraph * gf, int n_threads, ggml_threadpool * threadpool) { -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(lctx.backend_metal)) { - ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); - } -#endif - if (lctx.backend_cpu != nullptr) { ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); From 7254cdf7e8d6312840509ca8d766358c687c6266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 29 Sep 2024 23:18:02 +0200 Subject: [PATCH 19/40] ggml: fix gradient allocation logic (ggml/966) * ggml: fix gradient allocation logic * gradient allocation in ggml_build_backward_expand * fixup * fix test-backend-ops grad * suggestions by slaren * fix test1.c * fix legacy opt API * fix test-grad0 * remove keep arg --- ggml/include/ggml.h | 40 +- ggml/src/ggml.c | 1464 +++++++++++------------------------- tests/test-backend-ops.cpp | 37 +- tests/test-grad0.cpp | 14 +- 4 files changed, 490 insertions(+), 1065 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index f46d4a8a6..a8a74bee1 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -577,10 +577,10 @@ extern "C" { // this tensor... enum ggml_tensor_flag { - GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph - GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph - GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters - GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) + GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph + GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph + GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters + GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) }; // n-dimensional tensor @@ -1410,14 +1410,14 @@ extern "C" { // supports 3D: a->ne[2] == b->ne[1] GGML_API struct ggml_tensor * ggml_get_rows( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * a, // data + struct ggml_tensor * b); // row indices GGML_API struct ggml_tensor * ggml_get_rows_back( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c); + struct ggml_tensor * a, // gradients of ggml_get_rows result + struct ggml_tensor * b, // row indices + struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape GGML_API struct ggml_tensor * ggml_diag( struct ggml_context * ctx, @@ -1568,9 +1568,9 @@ extern "C" { // a - dy GGML_API struct ggml_tensor * ggml_rope_back( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, + struct ggml_tensor * a, // gradients of ggml_rope result + struct ggml_tensor * b, // positions + struct ggml_tensor * c, // freq factors int n_dims, int mode, int n_ctx_orig, @@ -2036,15 +2036,15 @@ extern "C" { // loss function GGML_API struct ggml_tensor * ggml_cross_entropy_loss( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_context * ctx, + struct ggml_tensor * a, // logits + struct ggml_tensor * b); // labels GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c); + struct ggml_context * ctx, + struct ggml_tensor * a, // logits + struct 
ggml_tensor * b, // labels + struct ggml_tensor * c); // gradients of cross_entropy_loss result // AdamW optimizer step // Paper: https://arxiv.org/pdf/1711.05101v3.pdf @@ -2066,7 +2066,7 @@ extern "C" { GGML_API void ggml_set_loss(struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); - GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate); GGML_API void ggml_build_opt_adamw( struct ggml_context * ctx, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 81b651c6a..fa8d6c25a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4725,18 +4725,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam static struct ggml_tensor * ggml_dup_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + struct ggml_tensor * a, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_DUP; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DUP; result->src[0] = a; return result; @@ -4744,13 +4737,13 @@ static struct ggml_tensor * ggml_dup_impl( struct ggml_tensor * ggml_dup( struct ggml_context * ctx, - struct ggml_tensor * a) { + struct ggml_tensor * a) { return ggml_dup_impl(ctx, a, false); } struct ggml_tensor * ggml_dup_inplace( struct ggml_context * ctx, - struct ggml_tensor * a) { + struct ggml_tensor * a) { return ggml_dup_impl(ctx, a, true); } @@ -4758,21 +4751,14 @@ struct ggml_tensor * ggml_dup_inplace( static struct ggml_tensor * ggml_add_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_ADD; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ADD; result->src[0] = a; result->src[1] = b; @@ -4781,15 +4767,15 @@ static struct ggml_tensor * ggml_add_impl( struct ggml_tensor * ggml_add( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_add_impl(ctx, a, b, false); } struct ggml_tensor * ggml_add_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_add_impl(ctx, a, b, true); } @@ -4797,9 +4783,9 @@ struct ggml_tensor * ggml_add_inplace( static struct ggml_tensor * ggml_add_cast_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - enum ggml_type type) { + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); GGML_ASSERT(ggml_can_repeat_rows(b, a)); @@ -4809,18 +4795,9 @@ static struct ggml_tensor * ggml_add_cast_impl( a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16); - bool is_node = false; - - if (a->grad || b->grad) { - // TODO: support backward pass for broadcasting - GGML_ASSERT(ggml_are_same_shape(a, b)); - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); - result->op = GGML_OP_ADD; - result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL; + result->op = GGML_OP_ADD; result->src[0] = a; result->src[1] = b; @@ -4829,9 +4806,9 @@ static struct ggml_tensor * ggml_add_cast_impl( struct ggml_tensor * ggml_add_cast( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - enum ggml_type type) { + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { return ggml_add_cast_impl(ctx, a, b, type); } @@ -4839,22 +4816,15 @@ struct ggml_tensor * ggml_add_cast( static struct ggml_tensor * ggml_add1_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_is_scalar(b)); GGML_ASSERT(ggml_is_padded_1d(a)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_ADD1; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ADD1; result->src[0] = a; result->src[1] = b; @@ -4863,15 +4833,15 @@ static struct ggml_tensor * ggml_add1_impl( struct ggml_tensor * ggml_add1( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_add1_impl(ctx, a, b, false); } struct ggml_tensor * ggml_add1_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_add1_impl(ctx, a, b, true); } @@ -4879,31 +4849,24 @@ struct ggml_tensor * ggml_add1_inplace( static struct ggml_tensor * ggml_acc_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a)); GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(a->type == GGML_TYPE_F32); GGML_ASSERT(b->type == GGML_TYPE_F32); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_ACC; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ACC; result->src[0] = a; result->src[1] = b; @@ -4912,23 +4875,23 @@ static struct ggml_tensor * ggml_acc_impl( struct ggml_tensor * ggml_acc( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset) { + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); } struct ggml_tensor * ggml_acc_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - size_t nb1, - size_t nb2, - size_t nb3, - size_t offset) { + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); } @@ -4936,23 +4899,14 @@ struct ggml_tensor * ggml_acc_inplace( static struct ggml_tensor * ggml_sub_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - // TODO: support backward pass for broadcasting - GGML_ASSERT(ggml_are_same_shape(a, b)); - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SUB; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SUB; result->src[0] = a; result->src[1] = b; @@ -4961,15 +4915,15 @@ static struct ggml_tensor * ggml_sub_impl( struct ggml_tensor * ggml_sub( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_sub_impl(ctx, a, b, false); } struct ggml_tensor * ggml_sub_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { return ggml_sub_impl(ctx, a, b, true); } @@ -4977,27 +4931,14 @@ struct ggml_tensor * ggml_sub_inplace( static struct ggml_tensor * ggml_mul_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - // TODO: support backward pass for broadcasting - GGML_ASSERT(ggml_are_same_shape(a, b)); - is_node = true; - } - - if (inplace) { - GGML_ASSERT(!is_node); - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_MUL; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MUL; result->src[0] = a; result->src[1] = b; @@ -5022,25 +4963,14 @@ struct ggml_tensor * ggml_mul_inplace( static struct ggml_tensor * ggml_div_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - - if (inplace) { - GGML_ASSERT(!is_node); - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_DIV; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DIV; result->src[0] = a; result->src[1] = b; @@ -5065,18 +4995,11 @@ struct ggml_tensor * ggml_div_inplace( static struct ggml_tensor * ggml_sqr_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + struct ggml_tensor * a, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SQR; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SQR; result->src[0] = a; return result; @@ -5098,18 +5021,11 @@ struct ggml_tensor * ggml_sqr_inplace( static struct ggml_tensor * ggml_sqrt_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + struct ggml_tensor * a, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SQRT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SQRT; result->src[0] = a; return result; @@ -5132,17 +5048,10 @@ struct ggml_tensor * ggml_sqrt_inplace( static struct ggml_tensor * ggml_log_impl( struct ggml_context * ctx, struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_LOG; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_LOG; result->src[0] = a; return result; @@ -5165,17 +5074,10 @@ struct ggml_tensor * ggml_log_inplace( static struct ggml_tensor * ggml_sin_impl( struct ggml_context * ctx, struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SIN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SIN; result->src[0] = a; return result; @@ -5198,17 +5100,10 @@ struct ggml_tensor * ggml_sin_inplace( static struct ggml_tensor * ggml_cos_impl( struct ggml_context * ctx, struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_COS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_COS; result->src[0] = a; return result; @@ -5230,17 +5125,10 @@ struct ggml_tensor * ggml_cos_inplace( struct ggml_tensor * ggml_sum( struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - + struct ggml_tensor * a) { struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); - result->op = GGML_OP_SUM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SUM; result->src[0] = a; return result; @@ -5250,13 +5138,7 @@ struct ggml_tensor * ggml_sum( struct ggml_tensor * ggml_sum_rows( struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - + struct ggml_tensor * a) { int64_t ne[GGML_MAX_DIMS] = { 1 }; for (int i = 1; i < GGML_MAX_DIMS; ++i) { ne[i] = a->ne[i]; @@ -5264,8 +5146,7 @@ struct ggml_tensor * ggml_sum_rows( struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne); - result->op = GGML_OP_SUM_ROWS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SUM_ROWS; result->src[0] = a; return result; @@ -5275,19 +5156,11 @@ struct ggml_tensor * ggml_sum_rows( struct ggml_tensor * ggml_mean( struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement - is_node = true; - } - + struct ggml_tensor * a) { int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_MEAN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MEAN; result->src[0] = a; return result; @@ -5297,19 +5170,12 @@ struct ggml_tensor * ggml_mean( struct ggml_tensor * ggml_argmax( struct ggml_context * ctx, - struct ggml_tensor * a) { + struct ggml_tensor * a) { GGML_ASSERT(ggml_is_matrix(a)); - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); - is_node = true; - } struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]); - result->op = GGML_OP_ARGMAX; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ARGMAX; result->src[0] = a; return result; @@ -5319,20 +5185,13 @@ struct ggml_tensor * ggml_argmax( struct ggml_tensor * ggml_repeat( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { GGML_ASSERT(ggml_can_repeat(a, b)); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); - result->op = GGML_OP_REPEAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_REPEAT; result->src[0] = a; return result; @@ -5342,24 +5201,13 @@ struct ggml_tensor * ggml_repeat( struct ggml_tensor * ggml_repeat_back( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * a, + struct ggml_tensor * b) { GGML_ASSERT(ggml_can_repeat(b, a)); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - if (ggml_are_same_shape(a, b) && !is_node) { - return a; - } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); - result->op = GGML_OP_REPEAT_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_REPEAT_BACK; result->src[0] = a; return result; @@ -5369,9 +5217,9 @@ struct ggml_tensor * ggml_repeat_back( struct ggml_tensor * ggml_concat( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int dim) { + struct ggml_tensor * a, + struct ggml_tensor * b, + int dim) { GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS); int64_t ne[GGML_MAX_DIMS]; @@ -5384,19 +5232,11 @@ struct ggml_tensor * ggml_concat( ne[d] = a->ne[d]; } - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ABORT("fatal error"); // TODO: implement - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne); ggml_set_op_params_i32(result, 0, dim); - result->op = GGML_OP_CONCAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONCAT; result->src[0] = a; result->src[1] = b; @@ -5505,20 +5345,14 @@ struct ggml_tensor * ggml_relu_inplace( struct ggml_tensor * ggml_leaky_relu( struct ggml_context * ctx, - struct ggml_tensor * a, float negative_slope, bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - GGML_ABORT("fatal error"); // TODO: not implemented - is_node = true; - } - + struct ggml_tensor * a, + float negative_slope, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &negative_slope, sizeof(negative_slope)); - result->op = GGML_OP_LEAKY_RELU; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_LEAKY_RELU; result->src[0] = a; return result; @@ -5586,17 +5420,9 @@ struct ggml_tensor * ggml_silu_back( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - bool is_node = false; - - if (a->grad || b->grad) { - // TODO: implement backward - is_node = true; - } - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SILU_BACK; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SILU_BACK; result->src[0] = a; result->src[1] = b; @@ -5604,6 +5430,7 @@ struct ggml_tensor * ggml_silu_back( } // ggml hardswish + struct ggml_tensor * ggml_hardswish( struct ggml_context * ctx, struct ggml_tensor * a) { @@ -5611,6 +5438,7 @@ struct ggml_tensor * ggml_hardswish( } // ggml hardsigmoid + struct ggml_tensor * ggml_hardsigmoid( struct ggml_context * ctx, struct ggml_tensor * a) { @@ -5618,6 +5446,7 @@ struct ggml_tensor * ggml_hardsigmoid( } // ggml exp + struct ggml_tensor * ggml_exp( struct ggml_context * ctx, struct ggml_tensor * a) { @@ -5635,21 +5464,13 @@ struct ggml_tensor * ggml_exp_inplace( static struct ggml_tensor * ggml_norm_impl( struct ggml_context * ctx, struct ggml_tensor * a, - float eps, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + float eps, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &eps, sizeof(eps)); - result->op = GGML_OP_NORM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_NORM; result->src[0] = a; return result; @@ -5658,14 +5479,14 @@ static struct ggml_tensor * ggml_norm_impl( struct ggml_tensor * ggml_norm( struct ggml_context * ctx, struct ggml_tensor * a, - float eps) { + float eps) { return ggml_norm_impl(ctx, a, eps, false); } struct ggml_tensor * ggml_norm_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - float eps) { + float eps) { return ggml_norm_impl(ctx, a, eps, true); } @@ -5674,20 +5495,13 @@ struct ggml_tensor * ggml_norm_inplace( static struct ggml_tensor * ggml_rms_norm_impl( struct ggml_context * ctx, struct ggml_tensor * a, - float eps, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - + float eps, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &eps, sizeof(eps)); - result->op = GGML_OP_RMS_NORM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RMS_NORM; result->src[0] = a; return result; @@ -5696,14 +5510,14 @@ static struct ggml_tensor * ggml_rms_norm_impl( struct ggml_tensor * ggml_rms_norm( struct ggml_context * ctx, struct ggml_tensor * a, - float eps) { + float eps) { return ggml_rms_norm_impl(ctx, a, eps, false); } struct ggml_tensor * ggml_rms_norm_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - float eps) { + float eps) { return ggml_rms_norm_impl(ctx, a, eps, true); } @@ -5713,20 +5527,12 @@ struct ggml_tensor * ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - float eps) { - bool is_node = false; - - if (a->grad) { - // TODO: implement backward - is_node = true; - } - + float eps) { struct ggml_tensor * result = ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &eps, sizeof(eps)); - result->op = GGML_OP_RMS_NORM_BACK; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RMS_NORM_BACK; result->src[0] = a; result->src[1] = b; @@ -5736,43 +5542,35 @@ struct ggml_tensor * ggml_rms_norm_back( // ggml_group_norm static struct ggml_tensor * ggml_group_norm_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_groups, - float eps, - bool inplace) { - - bool is_node = false; - if (!inplace && (a->grad)) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + float eps, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params_i32(result, 0, n_groups); ggml_set_op_params_f32(result, 1, eps); - result->op = GGML_OP_GROUP_NORM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_GROUP_NORM; result->src[0] = a; return result; } struct ggml_tensor * ggml_group_norm( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_groups, - float eps) { + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + float eps) { return ggml_group_norm_impl(ctx, a, n_groups, eps, false); } struct ggml_tensor * ggml_group_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_groups, - float eps) { + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + float eps) { return ggml_group_norm_impl(ctx, a, n_groups, eps, true); } @@ -5785,17 +5583,10 @@ struct ggml_tensor * ggml_mul_mat( GGML_ASSERT(ggml_can_mul_mat(a, b)); GGML_ASSERT(!ggml_is_transposed(a)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_MUL_MAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MUL_MAT; result->src[0] = a; result->src[1] = b; @@ -5841,17 +5632,10 @@ struct ggml_tensor * ggml_mul_mat_id( GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast - bool is_node = false; - - if (as->grad || b->grad) { - is_node = true; - } - const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_MUL_MAT_ID; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MUL_MAT_ID; result->src[0] = as; result->src[1] = b; result->src[2] = ids; @@ -5868,18 +5652,11 @@ struct ggml_tensor * ggml_out_prod( GGML_ASSERT(ggml_can_out_prod(a, b)); GGML_ASSERT(!ggml_is_transposed(a)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_OUT_PROD; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_OUT_PROD; result->src[0] = a; result->src[1] = b; @@ -5892,21 +5669,14 @@ static struct ggml_tensor * ggml_scale_impl( struct ggml_context * ctx, struct ggml_tensor * a, float s, - bool inplace) { + bool inplace) { GGML_ASSERT(ggml_is_padded_1d(a)); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, &s, sizeof(s)); - result->op = GGML_OP_SCALE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SCALE; result->src[0] = a; return result; @@ -5914,15 +5684,15 @@ static struct ggml_tensor * ggml_scale_impl( struct ggml_tensor * ggml_scale( struct ggml_context * ctx, - struct ggml_tensor * a, - float s) { + struct ggml_tensor * a, + float s) { return ggml_scale_impl(ctx, a, s, false); } struct ggml_tensor * ggml_scale_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - float s) { + struct ggml_tensor * a, + float s) { return ggml_scale_impl(ctx, a, s, true); } @@ -5936,15 +5706,9 @@ static struct ggml_tensor * ggml_set_impl( size_t nb2, size_t nb3, size_t offset, - bool inplace) { + bool inplace) { GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -5952,8 +5716,7 @@ static struct ggml_tensor * ggml_set_impl( int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_SET; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SET; result->src[0] = a; result->src[1] = b; @@ -5962,8 +5725,8 @@ static struct ggml_tensor * ggml_set_impl( struct ggml_tensor * ggml_set( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, @@ -5973,8 +5736,8 @@ struct ggml_tensor * ggml_set( struct ggml_tensor * ggml_set_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, @@ -5984,24 +5747,24 @@ struct ggml_tensor * ggml_set_inplace( struct ggml_tensor * ggml_set_1d( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t offset) { return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); } struct ggml_tensor * ggml_set_1d_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t offset) { return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); } struct ggml_tensor * ggml_set_2d( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t offset) { return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); @@ -6009,8 +5772,8 @@ struct ggml_tensor * ggml_set_2d( struct ggml_tensor * ggml_set_2d_inplace( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t offset) { return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true); @@ -6024,13 +5787,6 @@ static struct ggml_tensor * ggml_cpy_impl( struct ggml_tensor * b) { GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); - bool is_node = false; - - if (a->grad || b->grad) { - // inplace is false and either one have a grad - is_node = true; - } - // make a view of the destination struct ggml_tensor * result = ggml_view_tensor(ctx, b); if (strlen(b->name) > 0) { @@ -6039,8 +5795,7 
@@ static struct ggml_tensor * ggml_cpy_impl( ggml_format_name(result, "%s (copy)", a->name); } - result->op = GGML_OP_CPY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CPY; result->src[0] = a; result->src[1] = b; @@ -6058,15 +5813,11 @@ struct ggml_tensor * ggml_cast( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_type type) { - bool is_node = false; - struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); ggml_format_name(result, "%s (copy)", a->name); - result->op = GGML_OP_CPY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CPY; result->src[0] = a; - result->src[1] = result; return result; } @@ -6076,17 +5827,10 @@ struct ggml_tensor * ggml_cast( static struct ggml_tensor * ggml_cont_impl( struct ggml_context * ctx, struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); ggml_format_name(result, "%s (cont)", a->name); - result->op = GGML_OP_CONT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONT; result->src[0] = a; return result; @@ -6132,13 +5876,10 @@ struct ggml_tensor * ggml_cont_4d( int64_t ne3) { GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3)); - bool is_node = false; - struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); ggml_format_name(result, "%s (cont)", a->name); - result->op = GGML_OP_CONT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONT; result->src[0] = a; return result; @@ -6154,22 +5895,10 @@ struct ggml_tensor * ggml_reshape( // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous. GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - if (b->grad) { - // gradient propagation is not supported - //GGML_ABORT("fatal error"); - } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6182,18 +5911,11 @@ struct ggml_tensor * ggml_reshape_1d( GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(ggml_nelements(a) == ne0); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - const int64_t ne[1] = { ne0 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6207,18 +5929,11 @@ struct ggml_tensor * ggml_reshape_2d( GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(ggml_nelements(a) == ne0*ne1); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - const int64_t ne[2] = { ne0, ne1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6233,18 +5948,11 @@ struct ggml_tensor * ggml_reshape_3d( GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - const int64_t ne[3] = { ne0, ne1, ne2 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6260,18 +5968,11 @@ struct ggml_tensor * ggml_reshape_4d( GGML_ASSERT(ggml_is_contiguous(a)); GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RESHAPE; result->src[0] = a; return result; @@ -6283,20 +5984,12 @@ static struct ggml_tensor * ggml_view_impl( int n_dims, const int64_t * ne, size_t offset) { - - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset); ggml_format_name(result, "%s (view)", a->name); ggml_set_op_params(result, &offset, sizeof(offset)); - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_VIEW; result->src[0] = a; return result; @@ -6309,7 +6002,6 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * a, int64_t ne0, size_t offset) { - struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset); return result; @@ -6324,7 +6016,6 @@ struct ggml_tensor * ggml_view_2d( int64_t ne1, size_t nb1, size_t offset) { - const int64_t ne[2] = { ne0, ne1 }; struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset); @@ -6347,7 +6038,6 @@ struct ggml_tensor * ggml_view_3d( size_t nb1, size_t nb2, size_t offset) { - const int64_t ne[3] = { ne0, ne1, ne2 }; struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset); @@ -6372,7 +6062,6 @@ struct ggml_tensor * ggml_view_4d( size_t nb2, size_t nb3, size_t offset) { - const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset); @@ -6405,12 +6094,6 @@ struct ggml_tensor * ggml_permute( GGML_ASSERT(axis1 != axis3); GGML_ASSERT(axis2 != axis3); - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_view_tensor(ctx, a); ggml_format_name(result, "%s (permuted)", a->name); @@ -6437,8 +6120,7 @@ struct ggml_tensor * ggml_permute( result->nb[2] = nb[2]; result->nb[3] = nb[3]; - result->op = GGML_OP_PERMUTE; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_PERMUTE; result->src[0] = a; int32_t params[] = { axis0, axis1, axis2, axis3 }; @@ -6452,12 +6134,6 @@ struct ggml_tensor * ggml_permute( struct ggml_tensor * ggml_transpose( struct ggml_context * ctx, struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = ggml_view_tensor(ctx, a); ggml_format_name(result, "%s (transposed)", a->name); @@ -6467,8 +6143,7 @@ struct ggml_tensor * ggml_transpose( result->nb[0] = a->nb[1]; result->nb[1] = a->nb[0]; - result->op = GGML_OP_TRANSPOSE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_TRANSPOSE; result->src[0] = a; return result; @@ -6484,12 +6159,6 @@ struct ggml_tensor * ggml_get_rows( GGML_ASSERT(b->ne[3] == 1); GGML_ASSERT(b->type == GGML_TYPE_I32); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - // TODO: implement non F32 return enum ggml_type type = GGML_TYPE_F32; if (a->type == GGML_TYPE_I32) { @@ -6497,8 +6166,7 @@ struct ggml_tensor * ggml_get_rows( } struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]); - result->op = GGML_OP_GET_ROWS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_GET_ROWS; result->src[0] = a; result->src[1] = b; @@ -6515,18 +6183,11 @@ struct ggml_tensor * ggml_get_rows_back( GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - // TODO: implement non F32 return //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); - result->op = GGML_OP_GET_ROWS_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_GET_ROWS_BACK; result->src[0] = a; result->src[1] = b; @@ -6539,17 +6200,11 @@ struct ggml_tensor * ggml_diag( struct ggml_context * ctx, struct ggml_tensor * a) { GGML_ASSERT(a->ne[1] == 1); - bool is_node = false; - - if (a->grad) { - is_node = true; - } const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne); - result->op = GGML_OP_DIAG; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DIAG; result->src[0] = a; return result; @@ -6562,19 +6217,12 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * a, int n_past, bool inplace) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_DIAG_MASK_INF; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DIAG_MASK_INF; result->src[0] = a; return result; @@ -6601,19 +6249,12 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * a, int n_past, bool inplace) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_DIAG_MASK_ZERO; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_DIAG_MASK_ZERO; result->src[0] = a; return result; @@ -6656,19 +6297,12 @@ static struct ggml_tensor * ggml_soft_max_impl( GGML_ASSERT(mask); } - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); float params[] = { scale, max_bias }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_SOFT_MAX; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SOFT_MAX; result->src[0] = a; result->src[1] = mask; @@ -6703,16 +6337,9 @@ static struct ggml_tensor * ggml_soft_max_back_impl( struct ggml_tensor * a, struct ggml_tensor * b, bool inplace) { - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; // TODO : implement backward pass - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = GGML_OP_SOFT_MAX_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SOFT_MAX_BACK; result->src[0] = a; result->src[1] = b; @@ -6761,12 +6388,6 @@ static struct ggml_tensor * ggml_rope_impl( GGML_ASSERT(c->ne[0] >= n_dims / 2); } - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig }; @@ -6778,8 +6399,7 @@ static struct ggml_tensor * ggml_rope_impl( memcpy(params + 10, &beta_slow, sizeof(float)); ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_ROPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ROPE; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -6907,13 +6527,6 @@ struct ggml_tensor * ggml_rope_back( GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(a->ne[2] == b->ne[0]); - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false && "backwards pass not implemented"); - is_node = false; - } - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig }; @@ -6925,8 +6538,7 @@ struct ggml_tensor * ggml_rope_back( memcpy(params + 10, &beta_slow, sizeof(float)); ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_ROPE_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ROPE_BACK; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -6941,21 +6553,13 @@ struct ggml_tensor * ggml_clamp( struct ggml_tensor * a, float min, float max) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - // TODO: when implement backward, fix this: struct ggml_tensor * result = ggml_view_tensor(ctx, a); float params[] = { min, max }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CLAMP; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CLAMP; result->src[0] = a; return result; @@ -7017,13 +6621,6 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( GGML_ASSERT(p0 == 0); GGML_ASSERT(d0 == 1); - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), a->ne[1], b->ne[2], 1, @@ -7033,8 +6630,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_TRANSPOSE_1D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONV_TRANSPOSE_1D; result->src[0] = a; result->src[1] = b; @@ -7042,17 +6638,17 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( } // ggml_conv_depthwise -struct ggml_tensor * ggml_conv_depthwise_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { +struct ggml_tensor * ggml_conv_depthwise_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), @@ -7072,29 +6668,23 @@ struct ggml_tensor * ggml_conv_depthwise_2d( // b: [N, IC, IH, IW] // result: [N, OH, OW, IC*KH*KW] struct ggml_tensor * ggml_im2col( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1, - bool is_2D, - enum ggml_type dst_type) { - + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D, + enum ggml_type dst_type) { if(is_2D) { GGML_ASSERT(a->ne[2] == b->ne[2]); } else { GGML_ASSERT(a->ne[1] == b->ne[1]); GGML_ASSERT(b->ne[3] == 1); } - bool is_node = false; - - if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data - is_node = true; - } const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); @@ -7113,8 +6703,7 @@ struct ggml_tensor * ggml_im2col( int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_IM2COL; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_IM2COL; result->src[0] = a; result->src[1] = b; @@ -7122,30 +6711,22 @@ struct ggml_tensor * ggml_im2col( } struct ggml_tensor * ggml_im2col_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int64_t * ne, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1, - bool is_2D) { - - bool is_node = false; - - if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int64_t * ne, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D) { struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 
1 : 0) }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_IM2COL_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_IM2COL_BACK; result->src[0] = a; result->src[1] = b; @@ -7159,12 +6740,12 @@ struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW] struct ggml_tensor * result = @@ -7180,6 +6761,7 @@ struct ggml_tensor * ggml_conv_2d( } // ggml_conv_2d_sk_p0 + struct ggml_tensor * ggml_conv_2d_sk_p0( struct ggml_context * ctx, struct ggml_tensor * a, @@ -7209,13 +6791,6 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( int stride) { GGML_ASSERT(a->ne[3] == b->ne[2]); - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/), ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/), @@ -7226,8 +6801,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( ggml_set_op_params_i32(result, 0, stride); - result->op = GGML_OP_CONV_TRANSPOSE_2D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CONV_TRANSPOSE_2D; result->src[0] = a; result->src[1] = b; @@ -7249,14 +6823,6 @@ struct ggml_tensor * ggml_pool_1d( int k0, int s0, int p0) { - - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], @@ -7268,8 +6834,7 @@ struct ggml_tensor * ggml_pool_1d( int32_t params[] = { op, k0, s0, p0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_POOL_1D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_POOL_1D; result->src[0] = a; return result; @@ -7287,13 +6852,6 @@ struct ggml_tensor * ggml_pool_2d( int s1, float p0, float p1) { - - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result; const int64_t ne[4] = { ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), @@ -7306,9 +6864,9 @@ struct ggml_tensor * ggml_pool_2d( int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_POOL_2D; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_POOL_2D; result->src[0] = a; + return result; } @@ -7323,100 +6881,74 @@ struct ggml_tensor * ggml_pool_2d_back( int s1, float p0, float p1) { - - bool is_node = false; - - if (a->grad) { - is_node = true; - } - struct ggml_tensor * result; result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_POOL_2D_BACK; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_POOL_2D_BACK; result->src[0] = a; result->src[1] = af; + return result; } // ggml_upscale static struct ggml_tensor * ggml_upscale_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - int ne0, - int ne1, - int ne2, - int ne3) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1, + int ne2, + int ne3) { GGML_ASSERT(a->ne[0] <= ne0); GGML_ASSERT(a->ne[1] <= ne1); GGML_ASSERT(a->ne[2] <= ne2); GGML_ASSERT(a->ne[3] <= ne3); - struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, - ne0, - ne1, - ne2, - ne3 - ); + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); - result->op = GGML_OP_UPSCALE; - - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_UPSCALE; result->src[0] = a; return result; } struct ggml_tensor * ggml_upscale( - struct ggml_context * ctx, - struct ggml_tensor * a, - int scale_factor) { + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]); } struct ggml_tensor * ggml_upscale_ext( - struct ggml_context * ctx, - struct ggml_tensor * a, - int ne0, - int ne1, - int ne2, - int ne3) { + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1, + int ne2, + int ne3) { return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3); } // ggml_pad struct ggml_tensor * ggml_pad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, int p1, int p2, int p3) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1, + int p2, + int p3) { struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0] + p0, a->ne[1] + p1, a->ne[2] + p2, a->ne[3] + p3); - result->op = GGML_OP_PAD; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_PAD; result->src[0] = a; return result; @@ -7425,39 +6957,32 @@ struct ggml_tensor * ggml_pad( // ggml_arange struct ggml_tensor * ggml_arange( - struct ggml_context * ctx, - float start, - float stop, - float step) { - + struct ggml_context * ctx, + float start, + float stop, + float step) { GGML_ASSERT(stop > start); const int64_t steps = (int64_t) ceilf((stop - start) / step); struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps); - result->op = GGML_OP_ARANGE; ggml_set_op_params_f32(result, 0, start); ggml_set_op_params_f32(result, 1, stop); ggml_set_op_params_f32(result, 2, step); + result->op = GGML_OP_ARANGE; + return result; } // ggml_timestep_embedding struct ggml_tensor * ggml_timestep_embedding( - struct ggml_context * ctx, - struct ggml_tensor * timesteps, - int dim, - int max_period) { - bool is_node = false; - - if (timesteps->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * timesteps, + int dim, + int max_period) { int actual_dim = dim; if (dim % 2 != 0) { actual_dim = dim + 1; @@ -7465,11 +6990,10 @@ struct ggml_tensor * ggml_timestep_embedding( struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]); - result->op = GGML_OP_TIMESTEP_EMBEDDING; ggml_set_op_params_i32(result, 0, dim); ggml_set_op_params_i32(result, 1, max_period); - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_TIMESTEP_EMBEDDING; result->src[0] = timesteps; return result; @@ -7478,22 +7002,14 @@ struct ggml_tensor * ggml_timestep_embedding( // ggml_argsort struct ggml_tensor * ggml_argsort( - struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_sort_order order) { - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: not implemented - is_node = true; - } - + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_sort_order order) { struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); - result->op = GGML_OP_ARGSORT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ARGSORT; result->src[0] = a; return result; @@ -7546,10 +7062,6 @@ struct ggml_tensor * ggml_flash_attn_ext( bool is_node = false; - if (q->grad || k->grad || v->grad) { - is_node = true; - } - // permute(0, 2, 1, 3) int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); @@ -7676,17 +7188,9 @@ struct ggml_tensor * ggml_ssm_conv( GGML_ASSERT(sx->ne[1] == d_inner); GGML_ASSERT(n_t >= 0); - bool is_node = false; - - if (sx->grad || c->grad) { - GGML_ABORT("fatal error"); // TODO: implement - is_node = true; - } - struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_t, n_s); - result->op = GGML_OP_SSM_CONV; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_SSM_CONV; result->src[0] = sx; result->src[1] = c; @@ -7730,18 +7234,10 @@ struct ggml_tensor * ggml_ssm_scan( GGML_ASSERT(B->ne[2] == n_seqs); } - bool is_node = false; - - if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad) { - GGML_ABORT("fatal error"); // TODO: implement - is_node = true; - } - // concatenated y + ssm_states struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s)); result->op = GGML_OP_SSM_SCAN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = s; result->src[1] = x; result->src[2] = dt; @@ -7761,13 +7257,6 @@ struct ggml_tensor * ggml_win_part( GGML_ASSERT(a->ne[3] == 1); GGML_ASSERT(a->type == GGML_TYPE_F32); - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - // padding const int px = (w - a->ne[1]%w)%w; const int py = (w - a->ne[2]%w)%w; @@ -7782,8 +7271,7 @@ struct ggml_tensor * ggml_win_part( int32_t params[] = { npx, npy, w }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_WIN_PART; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_WIN_PART; result->src[0] = a; return result; @@ -7799,21 +7287,13 @@ struct ggml_tensor * ggml_win_unpart( int w) { GGML_ASSERT(a->type == GGML_TYPE_F32); - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { a->ne[0], w0, h0, 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { w }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_WIN_UNPART; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_WIN_UNPART; result->src[0] = a; return result; @@ -7829,18 +7309,10 @@ struct ggml_tensor * ggml_get_rel_pos( GGML_ASSERT(qh == kh); GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]); - bool is_node = false; - - if (a->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - const int64_t ne[4] = { a->ne[0], kh, qh, 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne); - result->op = GGML_OP_GET_REL_POS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_GET_REL_POS; result->src[0] = a; return result; @@ -7864,17 +7336,10 @@ static struct ggml_tensor * ggml_add_rel_pos_impl( GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]); GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]); - bool is_node = false; - - if (!inplace && (a->grad || pw->grad || ph->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params_i32(result, 0, inplace ? 1 : 0); - result->op = GGML_OP_ADD_REL_POS; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_ADD_REL_POS; result->src[0] = a; result->src[1] = pw; result->src[2] = ph; @@ -7902,12 +7367,12 @@ struct ggml_tensor * ggml_add_rel_pos_inplace( struct ggml_tensor * ggml_rwkv_wkv( struct ggml_context * ctx, - struct ggml_tensor * k, - struct ggml_tensor * v, - struct ggml_tensor * r, - struct ggml_tensor * tf, - struct ggml_tensor * td, - struct ggml_tensor * state) { + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * r, + struct ggml_tensor * tf, + struct ggml_tensor * td, + struct ggml_tensor * state) { GGML_ASSERT(ggml_is_contiguous(k)); GGML_ASSERT(ggml_is_contiguous(v)); GGML_ASSERT(ggml_is_contiguous(r)); @@ -7928,19 +7393,11 @@ struct ggml_tensor * ggml_rwkv_wkv( GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs); } - bool is_node = false; - - if (k->grad || v->grad || r->grad || tf->grad || td->grad || state->grad) { - GGML_ABORT("fatal error"); // TODO: implement backward - is_node = true; - } - // concat output and new_state const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_RWKV_WKV; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_RWKV_WKV; result->src[0] = k; result->src[1] = v; result->src[2] = r; @@ -7955,23 +7412,16 @@ struct ggml_tensor * ggml_rwkv_wkv( static struct ggml_tensor * ggml_unary_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - enum ggml_unary_op op, - bool inplace) { + struct ggml_tensor * a, + enum ggml_unary_op op, + bool inplace) { GGML_ASSERT(ggml_is_contiguous_1(a)); - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params_i32(result, 0, (int32_t) op); - result->op = GGML_OP_UNARY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_UNARY; result->src[0] = a; return result; @@ -7980,14 +7430,14 @@ static struct ggml_tensor * ggml_unary_impl( struct ggml_tensor * ggml_unary( struct ggml_context * ctx, struct ggml_tensor * a, - enum ggml_unary_op op) { + enum ggml_unary_op op) { return ggml_unary_impl(ctx, a, op, false); } struct ggml_tensor * ggml_unary_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - enum ggml_unary_op op) { + enum ggml_unary_op op) { return ggml_unary_impl(ctx, a, op, true); } @@ -7996,20 +7446,13 @@ struct ggml_tensor * ggml_unary_inplace( static struct ggml_tensor * ggml_map_unary_impl_f32( struct ggml_context * ctx, struct ggml_tensor * a, - const ggml_unary_op_f32_t fun, - bool inplace) { - bool is_node = false; - - if (!inplace && a->grad) { - is_node = true; - } - + const ggml_unary_op_f32_t fun, + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_UNARY; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_UNARY; result->src[0] = a; return result; @@ -8018,14 +7461,14 @@ static struct ggml_tensor * ggml_map_unary_impl_f32( struct ggml_tensor * ggml_map_unary_f32( struct ggml_context * ctx, struct ggml_tensor * a, - const ggml_unary_op_f32_t fun) { + const ggml_unary_op_f32_t fun) { return ggml_map_unary_impl_f32(ctx, a, fun, false); } struct ggml_tensor * ggml_map_unary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - const ggml_unary_op_f32_t fun) { + const ggml_unary_op_f32_t fun) { return ggml_map_unary_impl_f32(ctx, a, fun, true); } @@ -8035,22 +7478,15 @@ static struct ggml_tensor * ggml_map_binary_impl_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - const ggml_binary_op_f32_t fun, - bool inplace) { + const ggml_binary_op_f32_t fun, + bool inplace) { GGML_ASSERT(ggml_are_same_shape(a, b)); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_BINARY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_BINARY; result->src[0] = a; result->src[1] = b; @@ -8061,7 +7497,7 @@ struct ggml_tensor * ggml_map_binary_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - const ggml_binary_op_f32_t fun) { + const ggml_binary_op_f32_t fun) { return ggml_map_binary_impl_f32(ctx, a, b, fun, false); } @@ -8069,7 +7505,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - const ggml_binary_op_f32_t fun) { + const ggml_binary_op_f32_t fun) { return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } @@ -8079,19 +7515,12 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32( struct ggml_context * ctx, struct ggml_tensor * a, const ggml_custom1_op_f32_t fun, - bool inplace) { - bool is_node = false; - - if (!inplace && a->grad) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM1_F32; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM1_F32; result->src[0] = a; return result; @@ -8118,19 +7547,12 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32( struct ggml_tensor * a, struct ggml_tensor * b, const ggml_custom2_op_f32_t fun, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM2_F32; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM2_F32; result->src[0] = a; result->src[1] = b; @@ -8161,19 +7583,12 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32( struct ggml_tensor * b, struct ggml_tensor * c, const ggml_custom3_op_f32_t fun, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad || b->grad || c->grad)) { - is_node = true; - } - + bool inplace) { struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM3_F32; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM3_F32; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -8201,26 +7616,20 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32( // ggml_map_custom1 struct ggml_map_custom1_op_params { - ggml_custom1_op_t fun; - int n_tasks; - void * userdata; + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; }; static struct ggml_tensor * ggml_map_custom1_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_custom1_op_t fun, - int n_tasks, - void * userdata, - bool inplace) { + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); - bool is_node = false; - - if (!inplace && a->grad) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_map_custom1_op_params params = { @@ -8230,55 +7639,48 @@ static struct ggml_tensor * ggml_map_custom1_impl( }; ggml_set_op_params(result, (const void *) &params, sizeof(params)); - result->op = GGML_OP_MAP_CUSTOM1; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM1; result->src[0] = a; return result; } struct ggml_tensor * ggml_map_custom1( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_custom1_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); } struct ggml_tensor * ggml_map_custom1_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_custom1_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); } // ggml_map_custom2 struct ggml_map_custom2_op_params { - ggml_custom2_op_t fun; - int n_tasks; - void * userdata; + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; }; static struct ggml_tensor * ggml_map_custom2_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_custom2_op_t fun, - int n_tasks, - void * userdata, - bool inplace) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_map_custom2_op_params params = { @@ -8288,8 +7690,7 @@ static struct ggml_tensor * ggml_map_custom2_impl( }; ggml_set_op_params(result, (const void *) &params, sizeof(params)); - result->op = GGML_OP_MAP_CUSTOM2; - result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM2; result->src[0] = a; result->src[1] = b; @@ -8297,22 +7698,22 @@ static struct ggml_tensor * ggml_map_custom2_impl( } struct ggml_tensor * ggml_map_custom2( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_custom2_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); } struct ggml_tensor * ggml_map_custom2_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_custom2_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); } @@ -8325,22 +7726,16 @@ struct ggml_map_custom3_op_params { }; static struct ggml_tensor * ggml_map_custom3_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - const ggml_custom3_op_t fun, - int n_tasks, - void * userdata, - bool inplace) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); - bool is_node = false; - - if (!inplace && (a->grad || b->grad || c->grad)) { - is_node = true; - } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_map_custom3_op_params params = { @@ -8350,8 +7745,7 @@ static struct ggml_tensor * ggml_map_custom3_impl( }; ggml_set_op_params(result, (const void *) &params, sizeof(params)); - result->op = GGML_OP_MAP_CUSTOM3; - result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_MAP_CUSTOM3; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -8360,44 +7754,38 @@ static struct ggml_tensor * ggml_map_custom3_impl( } struct ggml_tensor * ggml_map_custom3( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - const ggml_custom3_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); } struct ggml_tensor * ggml_map_custom3_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c, - const ggml_custom3_op_t fun, - int n_tasks, - void * userdata) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); } // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { GGML_ASSERT(ggml_are_same_shape(a, b)); - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); - result->op = GGML_OP_CROSS_ENTROPY_LOSS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_OP_CROSS_ENTROPY_LOSS; result->src[0] = a; result->src[1] = b; @@ -8407,17 +7795,16 @@ struct ggml_tensor * ggml_cross_entropy_loss( // ggml_cross_entropy_loss_back struct ggml_tensor * ggml_cross_entropy_loss_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c) { + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { GGML_ASSERT(ggml_are_same_shape(a, b)); GGML_ASSERT(ggml_is_scalar(c)); struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; - result->grad = NULL; + result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; result->src[0] = a; result->src[1] = b; result->src[2] = c; @@ -8435,7 +7822,7 @@ struct ggml_tensor * ggml_opt_step_adamw( float beta2, float eps, float wd) { - GGML_ASSERT(a->grad); + GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM); GGML_ASSERT(alpha > 0.0f); GGML_ASSERT(beta1 >= 0.0f && beta1 <= 1.0f); GGML_ASSERT(beta2 >= 0.0f && beta2 <= 1.0f); @@ -8444,13 +7831,6 @@ struct ggml_tensor * ggml_opt_step_adamw( struct ggml_tensor * result = ggml_view_tensor(ctx, a); - result->op = GGML_OP_OPT_STEP_ADAMW; - result->grad = NULL; - result->src[0] = a; - result->src[1] = a->grad; - result->src[2] = ggml_dup_tensor(ctx, a->grad); - result->src[3] = ggml_dup_tensor(ctx, a->grad); - const int64_t iter = 1; memcpy(&result->op_params[0], &iter, sizeof(int64_t)); ggml_set_op_params_f32(result, 2, alpha); @@ -8459,26 +7839,17 @@ struct ggml_tensor * ggml_opt_step_adamw( ggml_set_op_params_f32(result, 5, eps); ggml_set_op_params_f32(result, 6, wd); + result->op = GGML_OP_OPT_STEP_ADAMW; + result->src[0] = a; + result->src[1] = a->grad; + result->src[2] = ggml_dup_tensor(ctx, a); + result->src[3] = ggml_dup_tensor(ctx, a); + return result; } 
//////////////////////////////////////////////////////////////////////////////// -void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) { - tensor->flags |= GGML_TENSOR_FLAG_PARAM; - - GGML_ASSERT(tensor->grad == NULL); - tensor->grad = ggml_dup_tensor(ctx, tensor); - ggml_format_name(tensor->grad, "%s (grad)", tensor->name); -} - -void ggml_set_loss(struct ggml_tensor * tensor) { - GGML_ASSERT(ggml_is_scalar(tensor)); - GGML_ASSERT(tensor->type == GGML_TYPE_F32); - GGML_ASSERT(tensor->grad); - tensor->flags |= GGML_TENSOR_FLAG_LOSS; -} - // ggml_compute_forward_dup static void ggml_compute_forward_dup_same_cont( @@ -18198,7 +17569,7 @@ void ggml_build_backward_gradient_checkpointing( struct ggml_tensor * * checkpoints, int n_checkpoints) { ggml_graph_cpy(gf, gb_tmp); - ggml_build_backward_expand(ctx, gf, gb_tmp, false, true); + ggml_build_backward_expand(ctx, gf, gb_tmp, false); if (n_checkpoints <= 0) { ggml_graph_cpy(gb_tmp, gb); @@ -18850,7 +18221,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_soft_max_back(ctx, tensor->grad, tensor), zero_table, acc_table); } - + GGML_ASSERT((!src1 || !src1->grad) && "backward pass for softmax mask not implemented"); } break; case GGML_OP_SOFT_MAX_BACK: { @@ -18891,6 +18262,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor beta_slow), zero_table, acc_table); } + GGML_ASSERT((!src2 || !src2->grad) && "gradients for freq factors not implemented"); } break; case GGML_OP_ROPE_BACK: { @@ -19012,6 +18384,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } case GGML_OP_FLASH_ATTN_EXT: { + GGML_ABORT("FA backward pass not adapted after rework"); struct ggml_tensor * flash_grad = NULL; if (src0->grad || src1->grad || tensor->src[2]->grad) { int32_t t = ggml_get_op_params_i32(tensor, 0); @@ -19186,6 +18559,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor tensor->grad), zero_table, acc_table); } + GGML_ASSERT(!src1->grad && "backward pass for labels not implemented"); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { @@ -19236,7 +18610,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } } - if (node->op == GGML_OP_NONE && node->grad == NULL) { + if (node->op == GGML_OP_NONE && !(node->flags & GGML_TENSOR_FLAG_PARAM)) { // reached a leaf node, not part of the gradient graph (e.g. 
a constant) GGML_ASSERT(cgraph->n_leafs < cgraph->size); @@ -19254,9 +18628,6 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } cgraph->nodes[cgraph->n_nodes] = node; - if (cgraph->grads) { - cgraph->grads[cgraph->n_nodes] = node->grad; - } cgraph->n_nodes++; } } @@ -19284,20 +18655,58 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * ggml_build_forward_impl(cgraph, tensor, true); } -void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep) { +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate) { GGML_ASSERT(gf->n_nodes > 0); GGML_ASSERT(gf->grads); - // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph - if (keep) { - for (int i = 0; i < gf->n_nodes; i++) { - struct ggml_tensor * node = gf->nodes[i]; + for (int i = 0; i < gf->n_nodes; ++i) { + struct ggml_tensor * node = gf->nodes[i]; - if (node->grad) { - node->grad = ggml_dup_tensor(ctx, node); - gf->grads[i] = node->grad; - } + bool needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM; + bool ignore_src[GGML_MAX_SRC] = {false}; + switch (node->op) { + // gradients in node->src[0] for one reason or another have no effect on output gradients + case GGML_OP_IM2COL: // only used for its shape + case GGML_OP_IM2COL_BACK: // same as IM2COL + ignore_src[0] = true; + break; + case GGML_OP_UNARY: { + const enum ggml_unary_op uop = ggml_get_unary_op(node); + // SGN and STEP unary ops are piecewise constant + if (uop == GGML_UNARY_OP_SGN || uop == GGML_UNARY_OP_STEP) { + ignore_src[0] = true; + } + } break; + + // gradients in node->src[1] for one reason or another have no effect on output gradients + case GGML_OP_CPY: // gradients in CPY target are irrelevant + case GGML_OP_GET_ROWS: // row indices not differentiable + case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS + case GGML_OP_ROPE: // positions not differentiable + ignore_src[1] = true; + break; + + default: + break; } + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (!node->src[j] || !node->src[j]->grad || ignore_src[j]) { + continue; + } + GGML_ASSERT(node->src[j]->type == GGML_TYPE_F32 || node->src[j]->type == GGML_TYPE_F16); + needs_grad = true; + break; + } + if (!needs_grad) { + continue; + } + + // inplace operations are currently not supported + GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW || + node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); + + // create a new tensor with the same type and shape as the node and set it as grad + node->grad = ggml_dup_tensor(ctx, node); } // keep tables of original gradients for replacement/accumulation logic @@ -22162,8 +21571,6 @@ enum ggml_opt_result ggml_opt( struct ggml_context * ctx, struct ggml_opt_params params, struct ggml_tensor * f) { - GGML_ASSERT(f->grad && "ggml_set_param called for at least one parent tensor."); - bool free_ctx = false; if (ctx == NULL) { struct ggml_init_params params_ctx = { @@ -22204,7 +21611,7 @@ enum ggml_opt_result ggml_opt_resume( ggml_build_forward_expand(gf, f); struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf); - ggml_build_backward_expand(ctx, gf, gb, false, true); + ggml_build_backward_expand(ctx, gf, gb, false); return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } @@ -22257,6 +21664,17 @@ void ggml_set_output(struct ggml_tensor * tensor) { 
tensor->flags |= GGML_TENSOR_FLAG_OUTPUT; } +void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) { + GGML_UNUSED(ctx); // TODO: remove this parameter + tensor->flags |= GGML_TENSOR_FLAG_PARAM; +} + +void ggml_set_loss(struct ggml_tensor * tensor) { + GGML_ASSERT(ggml_is_scalar(tensor)); + GGML_ASSERT(tensor->type == GGML_TYPE_F32); + tensor->flags |= GGML_TENSOR_FLAG_LOSS; +} + //////////////////////////////////////////////////////////////////////////////// void ggml_quantize_init(enum ggml_type type) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d2cfe06b5..5c78b6704 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1,6 +1,6 @@ // This file defines tests for various GGML ops and backends. // For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent. -// For the backwards pass it asserts that the gradients from backpropagation are consistent +// For the backward pass it asserts that the gradients from backpropagation are consistent // with the gradients obtained via the method of finite differences ("grad" mode, this is optional). // It is also possible to check the performance ("perf" mode). // @@ -740,7 +740,7 @@ struct test_case { ggml_tensor * out = build_graph(ctx); - if (op_name != nullptr && op_desc(out) != op_name) { + if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) { //printf(" %s: skipping\n", op_desc(out).c_str()); ggml_free(ctx); return true; @@ -749,11 +749,6 @@ struct test_case { printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str()); fflush(stdout); - if (out->grad == nullptr) { - printf("backwards pass not supported \n"); - ggml_free(ctx); - return true; - } if (out->type != GGML_TYPE_F32) { ggml_free(ctx); printf("not supported [%s->type != FP32]\n", out->name); @@ -762,18 +757,26 @@ struct test_case { // check if the backend supports the ops bool supported = true; + bool any_params = false; for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (!ggml_backend_supports_op(backend, t)) { printf("not supported [%s] ", ggml_backend_name(backend)); supported = false; break; } - if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) { - printf("not supported [%s->type != FP32] ", t->name); - supported = false; - break; + if ((t->flags & GGML_TENSOR_FLAG_PARAM)) { + any_params = true; + if (t->type != GGML_TYPE_F32) { + printf("not supported [%s->type != FP32] ", t->name); + supported = false; + break; + } } } + if (!any_params) { + printf("not supported [%s] \n", op_name); + supported = false; + } if (!supported) { printf("\n"); ggml_free(ctx); @@ -801,7 +804,7 @@ struct test_case { ggml_build_forward_expand(gf, out); ggml_graph_cpy(gf, gb); - ggml_build_backward_expand(ctx, gf, gb, false, false); + ggml_build_backward_expand(ctx, gf, gb, false); if (expect.size() != 1 || expect[0] != 0.0f) { GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { @@ -984,7 +987,7 @@ struct test_example : public test_case { } // In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a) // immediately after you create the tensors. - // This is optional and only makes sense if a backwards pass has actually been implemented for the new op. 
+ // This is optional and only makes sense if a backward pass has actually been implemented for the new op. }; @@ -1223,7 +1226,7 @@ struct test_set : public test_case { offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i]; } ggml_tensor * out = ggml_set(ctx, dst, src, - // The backwards pass requires setting a contiguous region: + // The backward pass requires setting a contiguous region: src->nb[1], src->nb[2], src->nb[3], offset); ggml_set_name(out, "out"); @@ -1335,7 +1338,7 @@ struct test_bin_bcast : public test_case { ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_set_name(b, "b"); - // The backwards pass supports broadcasting only for GGML_ADD: + // The backward pass supports broadcasting only for GGML_ADD: const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b); if (grad_supported) { ggml_set_param(ctx, a); @@ -1830,7 +1833,7 @@ struct test_log : public test_case { void initialize_tensors(ggml_context * ctx) override { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - // log(1) == 0, cluster values there to keep the sum low for better precision in the backwards pass: + // log(1) == 0, cluster values there to keep the sum low for better precision in the backward pass: init_tensor_uniform(t, 0.9f, 1.1f); } } @@ -3257,7 +3260,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1)); test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); - for (int ne3 : {1, 3}) { // CUDA backwards pass only supports ne3 == 1 + for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1 test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1})); test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1})); test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1})); diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 2ef606d2c..2200ad93d 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -240,12 +240,14 @@ static bool check_gradient( struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true); ggml_build_forward_expand(gf, f); ggml_graph_cpy(gf, gb); - ggml_build_backward_expand(ctx0, gf, gb, false, false); + ggml_build_backward_expand(ctx0, gf, gb, false); ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); + ggml_graph_reset(gb); + if (f->grad) { + ggml_set_f32(f->grad, 1.0f); + } ggml_graph_compute_with_ctx(ctx0, gb, n_threads); @@ -298,8 +300,10 @@ static bool check_gradient( ggml_set_f32_1d(x[i], k, x0); // compute gradient using backward graph - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); + ggml_graph_reset(gb); + if (f->grad) { + ggml_set_f32(f->grad, 1.0f); + } ggml_graph_compute_with_ctx(ctx0, gb, n_threads); From 6c5322481a75f1be54e58a000c3f78484d07f948 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Mon, 30 Sep 2024 10:11:41 +0300 Subject: [PATCH 20/40] ggml : fix ggml_cast (ggml/973) --- ggml/src/ggml.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index fa8d6c25a..aac4e3a7b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5818,6 +5818,7 @@ struct ggml_tensor * ggml_cast( result->op = GGML_OP_CPY; result->src[0] = a; + result->src[1] = result; return result; } From cb00020504416606601f8cb35f55ee07b710b4b7 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca 
Date: Mon, 30 Sep 2024 09:14:09 +0200 Subject: [PATCH 21/40] vulkan : mul_mat: fix UB with small warps (ggml/952) When the device's warp size is less than 16, it is possible for loadstride_a (mul_mm.comp:114) and loadstride_b (mul_mm.comp:115) to be set to 0. Because they are calculated as: the workgroup size, multiplied by LOAD_VEC_* (which can be 1) and divided by 16. And the workgroup size is set to be the same as the warp/subgroup size. The loadstride_* variables are used as increments in the loops that populate the buffers used for the multiplication. When they are 0 they cause an infinite loop. But infinite loops without side-effects are UB and the values of loadstride_* are known at compile time. So, the compiler quietly optimizes all the loops away. As a consequence, the buffers are not populated and the multiplication result is just a matrix with all elements set to 0. We prevent the UB by making sure that the workgroup size will never be less than 16, even if our device has a smaller warp size (e.g. 8). Signed-off-by: Salvatore Mesoraca --- ggml/src/ggml-vulkan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index c677a2728..00ad13bb9 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -1164,11 +1164,11 @@ static void ggml_vk_load_shaders(vk_device& device) { // mulmat std::initializer_list warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; std::initializer_list warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - std::initializer_list warptile_s = { device->subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; + std::initializer_list warptile_s = { std::max(device->subgroup_size, 16u), 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; std::initializer_list warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; std::initializer_list warptile_mmq_m = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - std::initializer_list warptile_mmq_s = { device->subgroup_size, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + std::initializer_list warptile_mmq_s = { std::max(device->subgroup_size, 16u), 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; std::array l_wg_denoms = {128, 128, 1 }; std::array m_wg_denoms = { 64, 64, 1 }; From e98c1c188ebbeb55d0cd6389ad03f0cbf995b451 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 30 Sep 2024 09:55:23 +0200 Subject: [PATCH 22/40] test: fix OPT_STEP_ADAMW for test-backend-ops (ggml/974) --- ggml/include/ggml.h | 1 + ggml/src/ggml.c | 10 ++++++---- tests/test-backend-ops.cpp | 5 ++++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index a8a74bee1..ce3d92cb2 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2052,6 +2052,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_opt_step_adamw( struct ggml_context * ctx, struct ggml_tensor * a, + struct ggml_tensor * grad, float alpha, float beta1, float beta2, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index aac4e3a7b..bcbc32d91 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7818,12 +7818,14 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( struct ggml_tensor * ggml_opt_step_adamw( struct ggml_context * ctx, struct ggml_tensor * a, + struct ggml_tensor * grad, float alpha, float beta1, float beta2, 
float eps, float wd) { GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM); + GGML_ASSERT(ggml_are_same_shape(a, grad)); GGML_ASSERT(alpha > 0.0f); GGML_ASSERT(beta1 >= 0.0f && beta1 <= 1.0f); GGML_ASSERT(beta2 >= 0.0f && beta2 <= 1.0f); @@ -7842,9 +7844,9 @@ struct ggml_tensor * ggml_opt_step_adamw( result->op = GGML_OP_OPT_STEP_ADAMW; result->src[0] = a; - result->src[1] = a->grad; - result->src[2] = ggml_dup_tensor(ctx, a); - result->src[3] = ggml_dup_tensor(ctx, a); + result->src[1] = grad; + result->src[2] = ggml_dup_tensor(ctx, grad); + result->src[3] = ggml_dup_tensor(ctx, grad); return result; } @@ -18769,7 +18771,7 @@ void ggml_build_opt_adamw( if (node->flags & GGML_TENSOR_FLAG_PARAM) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - struct ggml_tensor * opt_step = ggml_opt_step_adamw(ctx, node, alpha, beta1, beta2, eps, wd); + struct ggml_tensor * opt_step = ggml_opt_step_adamw(ctx, node, node->grad, alpha, beta1, beta2, eps, wd); ggml_build_forward_expand(gb, opt_step); } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 5c78b6704..95d983aa0 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2751,7 +2751,10 @@ struct test_opt_step_adamw : public test_case { ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not. ggml_set_name(a, "a"); - ggml_tensor * out = ggml_opt_step_adamw(ctx, a, alpha, beta1, beta2, eps, wd); + ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_name(grad, "grad"); + + ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, alpha, beta1, beta2, eps, wd); ggml_set_name(out, "out"); return out; From f1b8c4271125c2bfb9cfebd72a8b9e9f99061a30 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Oct 2024 16:09:42 +0300 Subject: [PATCH 23/40] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index aa301462a..23c24899e 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -9a24b8c8c40eab7262d067e91d08df160678df8d +4de6ee8e6a4b2145d6b92162bc87722fecb4ea46 From 3f1ae2e32cde00c39b96be6d01c2997c29bae555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Wodnicki?= <151604+32bitmicro@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:18:46 -0500 Subject: [PATCH 24/40] Update README.md (#9591) Add Bielik model. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ecc2df8ca..c56c97231 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ Typically finetunes of the base models below are supported as well. 
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) +- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) From 148844fe97fff4c1563a3111bf238ba4dd22ef56 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 2 Oct 2024 10:14:44 +0300 Subject: [PATCH 25/40] examples : remove benchmark (#9704) ggml-ci --- Makefile | 13 +- examples/CMakeLists.txt | 1 - examples/benchmark/CMakeLists.txt | 6 - examples/benchmark/benchmark-matmult.cpp | 275 ----------------------- 4 files changed, 1 insertion(+), 294 deletions(-) delete mode 100644 examples/benchmark/CMakeLists.txt delete mode 100644 examples/benchmark/benchmark-matmult.cpp diff --git a/Makefile b/Makefile index 8a903d7ed..11b11f5de 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,6 @@ BUILD_TARGETS = \ llama-batched \ llama-batched-bench \ llama-bench \ - llama-benchmark-matmult \ llama-cli \ llama-convert-llama2c-to-ggml \ llama-embedding \ @@ -68,7 +67,7 @@ TEST_TARGETS = \ # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm + retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries. 
@@ -1523,16 +1522,6 @@ common/build-info.o: common/build-info.cpp tests: $(TEST_TARGETS) -llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \ - $(OBJ_GGML) common/build-info.o - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -run-benchmark-matmult: llama-benchmark-matmult - ./$@ - -.PHONY: run-benchmark-matmult swift - tests/test-arg-parser: tests/test-arg-parser.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 67b3d2774..ead630661 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -16,7 +16,6 @@ else() add_subdirectory(baby-llama) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt deleted file mode 100644 index 34a58cc02..000000000 --- a/examples/benchmark/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -set(TARGET llama-bench-matmult) -add_executable(${TARGET} benchmark-matmult.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp deleted file mode 100644 index 922daf528..000000000 --- a/examples/benchmark/benchmark-matmult.cpp +++ /dev/null @@ -1,275 +0,0 @@ -#include "common.h" -#include "ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - -static float tensor_sum_elements(const ggml_tensor * tensor) { - double sum = 0; - if (tensor->type == GGML_TYPE_F32) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - sum += ((float *) tensor->data)[j*tensor->ne[0] + k]; - } - } - } - return sum; -} - -static void tensor_dump(const ggml_tensor * tensor, const char * name) { - printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name, - tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); - float sum = tensor_sum_elements(tensor); - printf("Sum of tensor %s is %6.2f\n", name, sum); -} - -#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) - -struct benchmark_params_struct { - int n_threads = 1; - int32_t n_iterations = 10; -}; - -static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - 
fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations); - fprintf(stderr, "\n"); -} - -int main(int argc, char ** argv) { - struct benchmark_params_struct benchmark_params; - - bool invalid_param = false; - std::string arg; - for (int i = 1; i < argc; i++) { - arg = argv[i]; - - if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - benchmark_params.n_threads = std::stoi(argv[i]); - } else if (arg == "-i" || arg == "--iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - benchmark_params.n_iterations = std::stoi(argv[i]); - } else if (arg == "-h" || arg == "--help") { - print_usage(argc, argv, benchmark_params); - exit(0); - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - print_usage(argc, argv, benchmark_params); - exit(1); - } - - print_build_info(); - printf("Starting Test\n"); - - // create the ggml context - struct ggml_context * ctx; - //const int sizex = 4096; - //const int sizey = 11008; - -#undef VERBOSE_DEBUGGING -#ifndef VERBOSE_DEBUGGING - const int sizey = 4096; - const int sizex = 11008; - const int sizez = 128; -#else - /* Working - let's increase size */ - const int sizey = 1; - const int sizex = (8*32); - const int sizez = 1; - - /*const int sizey = 1; - const int sizex = 3*(8*32); - const int sizez = 1;*/ -#endif - - //printf("Memsize required = %i\n", sizex*sizex); - - // TODO: perform the bench for all types or for a user specified type - const ggml_type qtype = GGML_TYPE_Q4_1; - - size_t ctx_size = 0; - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez); - ctx_size += ggml_row_size(qtype, sizex*sizey); - ctx_size += ggml_row_size(qtype, sizex*sizey); - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS - ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS - ctx_size += 1024*1024*16; - - printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); - - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /* no_alloc =*/ 0 - }; - - ctx = ggml_init(params); - if (!ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return 1; - } - - - printf("Creating new tensors\n"); - // printf("Creating new tensor m1\n"); - struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - ggml_set_f32(m11, 1.0f); - - // printf("Creating new tensor m1\n"); - struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - ggml_set_f32(m12, 1.5f); - - // printf("Creating new tensor m2\n"); - struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); - ggml_set_f32(m2, 2.0f); - - printf("\n------ Test 1 - Matrix Mult via F32 code\n"); - // printf("Creating new tensor m11xm2\n"); - struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); - - // printf("Creating compute graph\n"); - struct ggml_cgraph * gf = ggml_new_graph(ctx); - ggml_build_forward_expand(gf, m11xm2); - - printf("n_threads=%i\n", benchmark_params.n_threads); - - TENSOR_DUMP(m11); - TENSOR_DUMP(m2); - - std::vector work_buffer; - - ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); - - TENSOR_DUMP(ggml_graph_node(gf, 0)); - - printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); - - int32_t nelements = sizex*sizey; - - // Set 
up a the benchmark matrices - // printf("Creating new tensor q11 & Running quantize\n"); - struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr); - - // Set up a the compute graph - // printf("Creating new tensor q31\n"); - struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); - - // printf("Creating compute graph\n"); - struct ggml_cgraph * gf31 = ggml_new_graph(ctx); - ggml_build_forward_expand(gf31, q31); - - // Set up a second graph computation to make sure we override the CPU cache lines - // printf("Creating new tensor q12 & Running quantize\n"); - struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr); - - // printf("Creating new tensor q32\n"); - struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); - - //printf("Creating compute graph\n"); - struct ggml_cgraph * gf32 = ggml_new_graph(ctx); - ggml_build_forward_expand(gf32, q32); - printf("n_threads=%i\n", benchmark_params.n_threads); - - const int dimx = sizex; - const int dimy = sizey; - const int dimz = sizez; - long long int flops_per_dot_product = dimy + dimy; - long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; - printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); - - - // Let's use the F32 result from above as a reference for the quantized multiplication - float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0)); - - printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); - printf("=====================================================================================\n"); - - double gflops_sum = 0; - for (int i=0;i allowed_delta) { - printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n", - sum_of_F32_reference, - sum_of_Q4_result, - delta, - allowed_delta - ); - exit(0); - } - - // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); - } - printf("\n"); - printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); - printf("=====================================================================================\n"); -} From 76b37d1541a880b9645557c8715a343fd074cc5c Mon Sep 17 00:00:00 2001 From: Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> Date: Wed, 2 Oct 2024 15:21:57 +0800 Subject: [PATCH 26/40] gguf-split : improve --split and --merge logic (#9619) * make sure params --split and --merge are not specified at same time * update gguf-split params parse logic * Update examples/gguf-split/gguf-split.cpp Co-authored-by: slaren --------- Co-authored-by: Xuan Son Nguyen Co-authored-by: slaren --- examples/gguf-split/gguf-split.cpp | 86 +++++++++++++++++------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 82c239b83..7e62657e1 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -22,12 +22,20 @@ #endif enum split_operation : uint8_t { - SPLIT_OP_SPLIT, - SPLIT_OP_MERGE, + OP_NONE, + OP_SPLIT, + OP_MERGE, +}; + +enum split_mode : uint8_t { + 
MODE_NONE, + MODE_TENSOR, + MODE_SIZE, }; struct split_params { - split_operation operation = SPLIT_OP_SPLIT; + split_operation operation = OP_NONE; + split_mode mode = MODE_NONE; size_t n_bytes_split = 0; int n_split_tensors = 128; std::string input; @@ -87,59 +95,52 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } bool arg_found = false; - bool is_op_set = false; - bool is_mode_set = false; if (arg == "-h" || arg == "--help") { split_print_usage(argv[0]); exit(0); - } - if (arg == "--version") { + } else if (arg == "--version") { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); - } - if (arg == "--dry-run") { + } else if (arg == "--dry-run") { arg_found = true; params.dry_run = true; - } - if (arg == "--no-tensor-first-split") { + } else if (arg == "--no-tensor-first-split") { arg_found = true; params.no_tensor_first_split = true; - } - - if (is_op_set) { - throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); - } - if (arg == "--merge") { + } else if (arg == "--merge") { arg_found = true; - is_op_set = true; - params.operation = SPLIT_OP_MERGE; - } - if (arg == "--split") { + if (params.operation != OP_NONE && params.operation != OP_MERGE) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + params.operation = OP_MERGE; + } else if (arg == "--split") { arg_found = true; - is_op_set = true; - params.operation = SPLIT_OP_SPLIT; - } - - if (is_mode_set) { - throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); - } - if (arg == "--split-max-tensors") { + if (params.operation != OP_NONE && params.operation != OP_SPLIT) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + params.operation = OP_SPLIT; + } else if (arg == "--split-max-tensors") { if (++arg_idx >= argc) { invalid_param = true; break; } arg_found = true; - is_mode_set = true; + if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + params.mode = MODE_TENSOR; params.n_split_tensors = atoi(argv[arg_idx]); - } - if (arg == "--split-max-size") { + } else if (arg == "--split-max-size") { if (++arg_idx >= argc) { invalid_param = true; break; } arg_found = true; - is_mode_set = true; + if (params.mode != MODE_NONE && params.mode != MODE_SIZE) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + params.mode = MODE_SIZE; params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); } @@ -148,6 +149,15 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } } + // the operation is split if not specified + if (params.operation == OP_NONE) { + params.operation = OP_SPLIT; + } + // the split mode is by tensor if not specified + if (params.mode == MODE_NONE) { + params.mode = MODE_TENSOR; + } + if (invalid_param) { throw std::invalid_argument("error: invalid parameter for argument: " + arg); } @@ -265,13 +275,15 @@ struct split_strategy { } bool should_split(int i_tensor, size_t next_size) { - if (params.n_bytes_split > 0) { + if (params.mode == MODE_SIZE) { // split by max size per file return next_size > params.n_bytes_split; - } else { + } else if 
(params.mode == MODE_TENSOR) { // split by number of tensors per file return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; } + // should never happen + GGML_ABORT("invalid mode"); } void print_info() { @@ -559,9 +571,9 @@ int main(int argc, const char ** argv) { split_params_parse(argc, argv, params); switch (params.operation) { - case SPLIT_OP_SPLIT: gguf_split(params); + case OP_SPLIT: gguf_split(params); break; - case SPLIT_OP_MERGE: gguf_merge(params); + case OP_MERGE: gguf_merge(params); break; default: split_print_usage(argv[0]); exit(EXIT_FAILURE); From 00b7317e636ffdc7dae59cb51dbd1cda357a3168 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Wed, 2 Oct 2024 13:49:16 +0300 Subject: [PATCH 27/40] vulkan : do not use tensor->extra (#9407) * vulkan : do not use tensor->extra This patch allows using the Vulkan backend with the RPC backend as tensor->extra is no longer used. Ref: #8536 * Adapt GGML_VULKAN_CHECK_RESULTS to extra removal (#2) --------- Co-authored-by: 0cc4m --- ggml/src/ggml-vulkan.cpp | 342 +++++++++++++++++---------------------- 1 file changed, 148 insertions(+), 194 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 00ad13bb9..789b24bdf 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -433,16 +433,6 @@ struct vk_context_struct { typedef std::shared_ptr vk_context; typedef std::weak_ptr vk_context_ref; -struct ggml_tensor_extra_gpu { - vk_buffer_ref buffer_gpu; - uint64_t offset; - - void reset() { - buffer_gpu.reset(); - offset = 0; - } -}; - struct ggml_vk_garbage_collector { std::vector tl_semaphores; std::vector semaphores; @@ -553,6 +543,31 @@ struct ggml_backend_vk_context { std::vector tensor_ctxs; }; +static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + +static uint64_t vk_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; +} + +struct ggml_backend_vk_buffer_context { + vk_device_ref device; + vk_buffer dev_buffer; + std::string name; + + ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) : + device(device), + dev_buffer(dev_buffer), + name(name) { + } + + ~ggml_backend_vk_buffer_context() { + ggml_vk_destroy_buffer(dev_buffer); + } +}; + #ifdef GGML_VULKAN_MEMORY_DEBUG void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) { std::lock_guard guard(log_mutex); @@ -3076,9 +3091,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const uint64_t r2 = ne12 / ne02; const uint64_t r3 = ne13 / ne03; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; vk_buffer d_Qx; size_t qx_buf_offset = 0; @@ -3180,8 +3195,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D 
= dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); vk_buffer d_X; @@ -3189,13 +3204,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub vk_buffer d_Y; uint64_t y_buf_offset = 0; if (!src0_uma) { - d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset + src0->view_offs; + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { @@ -3276,9 +3291,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t r2 = ne12 / ne02; const uint64_t r3 = ne13 / ne03; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; vk_buffer d_Qx; size_t qx_buf_offset = 0; @@ -3357,21 +3372,21 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t y_buf_offset = 0; if(!src0_uma) { - d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset + src0->view_offs; + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if(!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { @@ -3454,9 +3469,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c GGML_ASSERT(ne11 == 1); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; vk_buffer d_Qy; size_t qy_buf_offset = 0; @@ -3482,15 +3497,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + 
const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); - vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); - const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs; + vk_buffer d_Qx = src0_buf_ctx->dev_buffer; + const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); if (!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qx != nullptr); } @@ -3532,9 +3547,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con GGML_ASSERT(ne11 == 1); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; @@ -3561,15 +3576,15 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); - vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); - const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs; + vk_buffer d_Qx = src0_buf_ctx->dev_buffer; + const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); if (!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qx != nullptr); } @@ -3631,10 +3646,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t n_as = ne02; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; + ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; vk_buffer d_Qx; size_t qx_buf_offset = 0; @@ -3731,26 +3746,26 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t 
y_buf_offset = 0; if (!src0_uma) { - d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset + src0->view_offs; + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (!ids_uma) { - d_ids = extra_ids->buffer_gpu.lock(); - ids_buf_offset = extra_ids->offset + ids->view_offs; + d_ids = ids_buf_ctx->dev_buffer; + ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs; GGML_ASSERT(d_ids != nullptr); } if (qx_needs_dequant) { @@ -3836,10 +3851,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte const uint64_t ne22 = dst->ne[2]; const uint64_t ne23 = dst->ne[3]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; + ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; vk_buffer d_Qx; size_t qx_buf_offset = 0; @@ -3924,26 +3939,26 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte return; } - vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset + dst->view_offs; + vk_buffer d_D = dst_buf_ctx->dev_buffer; + const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t y_buf_offset = 0; if(!src0_uma) { - d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset + src0->view_offs; + d_Qx = src0_buf_ctx->dev_buffer; + qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if(!src1_uma) { - d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset + src1->view_offs; + d_Qy = src1_buf_ctx->dev_buffer; + qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if(!ids_uma) { - d_ids = extra_ids->buffer_gpu.lock(); - ids_buf_offset = extra_ids->offset + ids->view_offs; + d_ids = ids_buf_ctx->dev_buffer; + ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs; GGML_ASSERT(d_ids != nullptr); } if (qx_needs_dequant) { @@ -4250,7 +4265,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? 
"dryrun" : "") << ")"); GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT - GGML_ASSERT(dst->extra != nullptr); + GGML_ASSERT(dst->buffer != nullptr); const uint64_t ne00 = src0->ne[0]; const uint64_t ne01 = src0->ne[1]; const uint64_t ne02 = src0->ne[2]; @@ -4296,10 +4311,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr; + ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr; vk_buffer d_X = nullptr; size_t x_buf_offset = 0; @@ -4330,7 +4345,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0; uint64_t d_sz = ggml_type_size(dst->type) * ned; - vk_buffer d_D = extra->buffer_gpu.lock(); + vk_buffer d_D = dst_buf_ctx->dev_buffer; // Workaround for tiny tensor inputs on ROPE if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) { @@ -4338,21 +4353,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } GGML_ASSERT(d_D != nullptr); - uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; - GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT + uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; + GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT if(!src0_uma) { - d_X = extra_src0->buffer_gpu.lock(); - x_buf_offset = extra_src0->offset + src0->view_offs; + d_X = src0_buf_ctx->dev_buffer; + x_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_X != nullptr); } if (use_src1 && !src1_uma) { - d_Y = extra_src1->buffer_gpu.lock(); - y_buf_offset = extra_src1->offset + src1->view_offs; + d_Y = src1_buf_ctx->dev_buffer; + y_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Y != nullptr); } if (use_src2 && !src2_uma) { - d_Z = extra_src2->buffer_gpu.lock(); - z_buf_offset = extra_src2->offset + src2->view_offs; + d_Z = src2_buf_ctx->dev_buffer; + z_buf_offset = vk_tensor_offset(src2) + src2->view_offs; GGML_ASSERT(d_Z != nullptr); } @@ -4531,11 +4546,10 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, } static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, 
ggml_tensor * dst, bool dryrun = false) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 @@ -4724,10 +4738,9 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co } static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { (uint32_t)ggml_nelements(src0), @@ -5535,14 +5548,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, } #endif -static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) { - VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))"); - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; - extra->reset(); - tensor->extra = extra; - return extra; -} - static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { #if defined(GGML_VULKAN_RUN_TESTS) ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32); @@ -5711,9 +5716,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* t // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. 
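// Aside on the scheme used throughout this patch: vk_tensor_offset(t) (plus t->view_offs
// in the real call sites) replaces the old ggml_tensor_extra_gpu bookkeeping. Since
// tensor->data in this backend is an address expressed relative to the fixed fake base
// vk_ptr_base, the byte offset inside the backend buffer can be recomputed on demand from
// tensor->data (or view_src->data for views), so no per-tensor extra struct needs to be
// allocated or kept in sync. The following is a minimal standalone sketch of that idea;
// the toy_* names are hypothetical stand-ins, not the real ggml/Vulkan types.

#include <cstdint>
#include <cstdio>

struct toy_tensor {
    void       * data;     // fake base pointer + byte offset inside the device buffer
    toy_tensor * view_src; // non-null when this tensor is a view of another tensor
};

// A fixed sentinel base address, used only for pointer arithmetic (mirrors vk_ptr_base).
static void * const toy_ptr_base = (void *)(uintptr_t) 0x1000;

static uint64_t toy_tensor_offset(const toy_tensor * t) {
    const toy_tensor * src = t->view_src ? t->view_src : t;
    return (uint8_t *) src->data - (uint8_t *) toy_ptr_base;
}

int main() {
    toy_tensor a = { (uint8_t *) toy_ptr_base + 256, nullptr }; // placed 256 bytes into the buffer
    toy_tensor v = { nullptr, &a };                             // a view sharing a's storage
    printf("offset(a) = %llu, offset(v) = %llu\n",
           (unsigned long long) toy_tensor_offset(&a),  // 256
           (unsigned long long) toy_tensor_offset(&v)); // 256
    return 0;
}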
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){ - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; - - if (ggml_is_empty(node) || extra == nullptr) { + if (ggml_is_empty(node) || !node->buffer) { return false; } @@ -5965,7 +5968,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){ - ggml_tensor_extra_gpu * extra = nullptr; + ggml_backend_buffer * buf = nullptr; switch (tensor->op) { case GGML_OP_ADD: @@ -6001,7 +6004,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: case GGML_OP_REPEAT: - extra = (ggml_tensor_extra_gpu *) tensor->extra; + buf = tensor->buffer; break; case GGML_OP_UNARY: @@ -6011,7 +6014,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_TANH: - extra = (ggml_tensor_extra_gpu *) tensor->extra; + buf = tensor->buffer; break; default: return false; @@ -6019,14 +6022,14 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - extra = (ggml_tensor_extra_gpu *) tensor->extra; + buf = tensor->buffer; break; default: return false; } - if (extra == nullptr) { + if (buf == nullptr) { return false; } @@ -6167,42 +6170,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript // device backend -static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT - -struct ggml_backend_vk_buffer_context { - vk_device_ref device; - vk_buffer dev_buffer; - ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; - size_t temp_tensor_extra_index = 0; - std::string name; - - ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) : - device(device), - dev_buffer(dev_buffer), - name(name) { - } - - ~ggml_backend_vk_buffer_context() { - ggml_vk_destroy_buffer(dev_buffer); - if (temp_tensor_extras != nullptr) { - delete[] temp_tensor_extras; - } - } - - ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() { - if (temp_tensor_extras == nullptr) { - temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES]; - } - - size_t alloc_index = temp_tensor_extra_index; - temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES; - ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; - extra->reset(); - - return extra; - } -}; - GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; return ctx->name.c_str(); @@ -6227,51 +6194,37 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); - ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; - if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - GGML_ASSERT(tensor->view_src->extra != nullptr); 
- tensor->extra = tensor->view_src->extra; - } else { - ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra(); - extra->buffer_gpu = ctx->dev_buffer; - extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; - tensor->extra = extra; } } GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; + vk_buffer buf = buf_ctx->dev_buffer; - vk_buffer buf = extra->buffer_gpu.lock(); - - ggml_vk_buffer_write(buf, extra->offset + tensor->view_offs + offset, data, size); - - GGML_UNUSED(buffer); + ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; - vk_buffer buf = extra->buffer_gpu.lock(); + vk_buffer buf = buf_ctx->dev_buffer; - ggml_vk_buffer_read(buf, extra->offset + tensor->view_offs + offset, data, size); - - GGML_UNUSED(buffer); + ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_vk(src->buffer)) { - ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - vk_buffer src_buf = src_extra->buffer_gpu.lock(); - vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); + vk_buffer src_buf = src_buf_ctx->dev_buffer; + vk_buffer dst_buf = dst_buf_ctx->dev_buffer; - ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); + ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src)); return true; } @@ -6449,7 +6402,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; vk_context transfer_ctx; @@ -6462,9 +6415,9 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g transfer_ctx = ctx->transfer_ctx.lock(); } - vk_buffer buf = extra->buffer_gpu.lock(); 
+ vk_buffer buf = buf_ctx->dev_buffer; - ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size); + ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -6472,7 +6425,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; vk_context transfer_ctx; @@ -6485,17 +6438,17 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c transfer_ctx = ctx->transfer_ctx.lock(); } - vk_buffer buf = extra->buffer_gpu.lock(); + vk_buffer buf = buf_ctx->dev_buffer; - ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size); + ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { - ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; vk_context transfer_ctx; @@ -6508,10 +6461,10 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c transfer_ctx = ctx->transfer_ctx.lock(); } - vk_buffer src_buf = src_extra->buffer_gpu.lock(); - vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); + vk_buffer src_buf = src_buf_ctx->dev_buffer; + vk_buffer dst_buf = dst_buf_ctx->dev_buffer; - ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); + ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src)); return true; } @@ -6949,10 +6902,10 @@ static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size); + vk_buffer buffer_gpu = buf_ctx->dev_buffer; + ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + 
tensor->view_offs, tensor_data, tensor_size); } std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; @@ -7026,9 +6979,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { memcpy(src0_clone->data, src0->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); } else if (ggml_backend_buffer_is_vk(src0->buffer)) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - uint64_t offset = extra->offset + src0->view_offs; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(src0) + src0->view_offs; if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) { for (int i3 = 0; i3 < src0->ne[3]; i3++) { for (int i2 = 0; i2 < src0->ne[2]; i2++) { @@ -7068,9 +7021,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { memcpy(src1_clone->data, src1->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); } else if (ggml_backend_buffer_is_vk(src1->buffer)) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - uint64_t offset = extra->offset + src1->view_offs; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(src1) + src1->view_offs; if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) { for (int i3 = 0; i3 < src1->ne[3]; i3++) { for (int i2 = 0; i2 < src1->ne[2]; i2++) { @@ -7110,9 +7063,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { memcpy(src2_clone->data, src2->data, src2_size); memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS); } else if (ggml_backend_buffer_is_vk(src2->buffer)) { - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - uint64_t offset = extra->offset + src2->view_offs; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src2->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(src2) + src2->view_offs; if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) { for (int i3 = 0; i3 < src2->ne[3]; i3++) { for (int i2 = 0; i2 < src2->ne[2]; i2++) { @@ -7167,7 +7120,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { } else if (tensor->op == GGML_OP_PAD) { tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]); } else if (tensor->op == GGML_OP_REPEAT) { - tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone); + tensor_clone = ggml_repeat(ggml_ctx, src0_clone, tensor); } else if (tensor->op == GGML_OP_ADD) { tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_ACC) { @@ -7312,14 +7265,15 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - if 
(extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) { - tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs); + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs; + if (offset + tensor_size >= buffer_gpu->size) { + tensor_size = buffer_gpu->size - offset; } - ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size); + ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size); } float first_error_result = -1.0f; From f536f4c4391bec74c432a924625c04e8c484d3ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Wed, 2 Oct 2024 13:57:18 +0100 Subject: [PATCH 28/40] [SYCL] Initial cmake support of SYCL for AMD GPUs (#9658) sycl: initial cmake support of SYCL for AMD GPUs --- docs/backend/SYCL.md | 92 ++++++++++++++++++++++++++++++++--------- ggml/src/CMakeLists.txt | 19 ++++++++- 2 files changed, 90 insertions(+), 21 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index bc266f7d8..ea34182e4 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -26,7 +26,7 @@ ### Llama.cpp + SYCL -The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*). +The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD. ## Recommended Release @@ -111,10 +111,18 @@ SYCL backend supports Intel GPU Family: **Verified devices** -| Nvidia GPU | Status | Verified Model | -|--------------------------|---------|----------------| -| Ampere Series | Support | A100, A4000 | -| Ampere Series *(Mobile)* | Support | RTX 40 Series | +| Nvidia GPU | Status | Verified Model | +|--------------------------|-----------|----------------| +| Ampere Series | Supported | A100, A4000 | +| Ampere Series *(Mobile)* | Supported | RTX 40 Series | + +| AMD GPU | Status | Verified Model | +|--------------------------|--------------|----------------| +| Radeon Pro | Experimental | W6800 | +| Radeon RX | Experimental | 6700 XT | + +Note: AMD GPU support is highly experimental and is incompatible with F16. +Additionally, it only supports GPUs with a sub_group_size (warp size) of 32. ## Docker The docker build option is currently limited to *intel GPU* targets. @@ -186,6 +194,10 @@ Platform #0: Intel(R) OpenCL HD Graphics In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed. +- **AMD GPU** + +To target AMD GPUs with SYCL, the ROCm stack must be installed first. + 2. **Install Intel® oneAPI Base toolkit** - **For Intel GPU** @@ -212,6 +224,19 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB cmake --build buildWithCublas --config Release ``` +- **Adding support to AMD GPUs** + +**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit. + +**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. 
A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs. + +```sh +git clone https://github.com/oneapi-src/oneMKL +cd oneMKL +# Find your HIPTARGET with rocminfo, under the key 'Name:' +cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas +cmake --build buildWithrocBLAS --config Release +``` 3. **Verify installation and environment** @@ -223,22 +248,32 @@ sycl-ls - **Intel GPU** -When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below: +When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below: ``` -[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] -[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] -[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] -[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] +[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] +[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] +[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] +[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] ``` - **Nvidia GPU** -Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow: +Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below: + ``` -[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix] -[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix] -[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2] +[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix] +[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix] +[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5] +``` + +- **AMD GPU** + +For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]: + +``` +[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000] +[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9] ``` ### II. 
Build llama.cpp @@ -266,6 +301,7 @@ cmake --build build --config Release -j -v ``` #### Nvidia GPU + ```sh # Export relevant ENV variables export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH @@ -283,7 +319,25 @@ cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx - # build all binary cmake --build build --config Release -j -v +``` +#### AMD GPU + +```sh +# Export relevant ENV variables +export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH +export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH +export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR + +# Build LLAMA with rocBLAS acceleration through SYCL + +## AMD +# Use FP32, FP16 is not supported +# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:' +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + +# build all binary +cmake --build build --config Release -j -v ``` ### III. Run the inference @@ -586,11 +640,11 @@ use 1 SYCL GPUs: [0] with Max compute units:512 #### Build -| Name | Value | Function | -|--------------------|-----------------------------------|---------------------------------------------| -| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model| -| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | -| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| Name | Value | Function | +|--------------------|---------------------------------------|---------------------------------------------| +| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model| +| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | +| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index cbc349500..2982f00ec 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -511,8 +511,8 @@ if (GGML_HIPBLAS) endif() if (GGML_SYCL) - if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$") - message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") + if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$") + message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD") endif() check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL) @@ -532,6 +532,9 @@ if (GGML_SYCL) list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL) if (GGML_SYCL_F16) + if (GGML_SYCL_TARGET STREQUAL "AMD") + message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.") + endif() add_compile_definitions(GGML_SYCL_F16) endif() @@ -543,6 +546,12 @@ if (GGML_SYCL) if (GGML_SYCL_TARGET STREQUAL "NVIDIA") add_compile_definitions(GGML_SYCL_WARP_SIZE=32) + elseif (GGML_SYCL_TARGET STREQUAL "AMD") + # INFO: Allowed Sub_group_sizes are not consistent through all + # hip targets. For example, 64 is used for certain models, but the backend + # does not support it. + # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32) + add_compile_definitions(GGML_SYCL_WARP_SIZE=32) else() add_compile_definitions(GGML_SYCL_WARP_SIZE=16) endif() @@ -576,6 +585,12 @@ if (GGML_SYCL) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl) + elseif (GGML_SYCL_TARGET STREQUAL "AMD") + if (GGML_SYCL_HIP_TARGET STREQUAL "") + message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}") + list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl) endif() endif() endif() From a39ab216aa624308fda7fa84439c6b61dc98b87a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 2 Oct 2024 15:49:55 +0200 Subject: [PATCH 29/40] llama : reduce compile time and binary size (#9712) * llama : speed up compile time * fix build * fix build (2) --- src/llama.cpp | 22 +++++++++++----------- src/unicode-data.cpp | 10 ++++++---- src/unicode-data.h | 8 ++++---- src/unicode.cpp | 21 ++++++++++++++------- 4 files changed, 35 insertions(+), 26 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 4c0a1bb61..af19e5c86 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -610,7 +610,7 @@ enum llm_tensor { LLM_TENSOR_CLS_OUT, }; -static const std::map> LLM_TENSOR_NAMES = { +static const std::map> LLM_TENSOR_NAMES = { { LLM_ARCH_LLAMA, { @@ -1566,32 +1566,32 @@ struct LLM_TN { return LLM_TENSOR_NAMES.at(arch).at(tensor); } - std::string operator()(llm_tensor tensor, const std::string & suffix) const { + std::string operator()(llm_tensor tensor, const char * suffix) const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == 
LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix; + return std::string(LLM_TENSOR_NAMES.at(arch).at(tensor)) + "." + suffix; } std::string operator()(llm_tensor tensor, int bid) const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid); + return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid); } - std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { + std::string operator()(llm_tensor tensor, const char * suffix, int bid) const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix; + return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid) + "." + suffix; } - std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const { + std::string operator()(llm_tensor tensor, const char * suffix, int bid, int xid) const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix; + return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid) + "." + suffix; } }; @@ -4916,7 +4916,7 @@ struct llama_model_loader { static const int TENSOR_NOT_REQUIRED = 1; static const int TENSOR_DUPLICATED = 2; - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, int flags = 0) { + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list & ne, int flags = 0) { const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); if (cur == NULL) { @@ -4926,7 +4926,7 @@ struct llama_model_loader { return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED); } - struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector & ne, size_t offset, bool required = true) { + struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list & ne, size_t offset, bool required = true) { const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); if (cur == NULL) { @@ -4939,7 +4939,7 @@ struct llama_model_loader { std::array dims; for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { - dims[i] = i < ne.size() ? ne[i] : 1; + dims[i] = i < ne.size() ? 
ne.begin()[i] : 1; } struct ggml_tensor * tensor = ggml_view_4d(ctx, base, diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp index 02bdf7823..07424bbab 100644 --- a/src/unicode-data.cpp +++ b/src/unicode-data.cpp @@ -7,7 +7,7 @@ #include #include -const std::vector> unicode_ranges_flags = { // start, flags // last=next_start-1 +const std::initializer_list> unicode_ranges_flags = { // start, flags // last=next_start-1 {0x000000, 0x0080}, {0x000020, 0x0008}, {0x000021, 0x0020}, @@ -2311,7 +2311,8 @@ const std::unordered_set unicode_set_whitespace = { 0x003000, }; -const std::unordered_map unicode_map_lowercase = { +// list is always in ascending order, to enable binary searh +const std::initializer_list> unicode_map_lowercase = { {0x000041, 0x000061}, {0x000042, 0x000062}, {0x000043, 0x000063}, @@ -3747,7 +3748,8 @@ const std::unordered_map unicode_map_lowercase = { {0x01E921, 0x01E943}, }; -const std::unordered_map unicode_map_uppercase = { +// list is always in ascending order, to enable binary searh +const std::initializer_list> unicode_map_uppercase = { {0x000061, 0x000041}, {0x000062, 0x000042}, {0x000063, 0x000043}, @@ -5200,7 +5202,7 @@ const std::unordered_map unicode_map_uppercase = { {0x01E943, 0x01E921}, }; -const std::vector unicode_ranges_nfd = { // start, last, nfd +const std::initializer_list unicode_ranges_nfd = { // start, last, nfd {0x000000, 0x000000, 0x000000}, {0x0000C0, 0x0000C5, 0x000041}, {0x0000C7, 0x0000C7, 0x000043}, diff --git a/src/unicode-data.h b/src/unicode-data.h index e27fe1770..f6973ebd2 100644 --- a/src/unicode-data.h +++ b/src/unicode-data.h @@ -13,8 +13,8 @@ struct range_nfd { static const uint32_t MAX_CODEPOINTS = 0x110000; -extern const std::vector> unicode_ranges_flags; +extern const std::initializer_list> unicode_ranges_flags; extern const std::unordered_set unicode_set_whitespace; -extern const std::unordered_map unicode_map_lowercase; -extern const std::unordered_map unicode_map_uppercase; -extern const std::vector unicode_ranges_nfd; +extern const std::initializer_list> unicode_map_lowercase; +extern const std::initializer_list> unicode_map_uppercase; +extern const std::initializer_list unicode_ranges_nfd; diff --git a/src/unicode.cpp b/src/unicode.cpp index f4e941cd1..50b35bbbc 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -123,11 +123,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) { static std::vector unicode_cpt_flags_array() { std::vector cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED); - assert (unicode_ranges_flags.front().first == 0); - assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS); + assert (unicode_ranges_flags.begin()[0].first == 0); + assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS); for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) { - const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags - const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags + const auto range_ini = unicode_ranges_flags.begin()[i-1]; // codepoint_ini, flags + const auto range_end = unicode_ranges_flags.begin()[i]; // codepoint_end, flags for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) { cpt_flags[cpt] = range_ini.second; } @@ -597,7 +597,7 @@ std::vector unicode_cpts_normalize_nfd(const std::vector & c std::vector result(cpts.size()); for (size_t i = 0; i < cpts.size(); ++i) { const uint32_t cpt = cpts[i]; - auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), 
cpt, comp) - 1; + auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1; result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt; } return result; @@ -639,8 +639,15 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) { } uint32_t unicode_tolower(uint32_t cp) { - auto it = unicode_map_lowercase.find(cp); - return it == unicode_map_lowercase.end() ? cp : it->second; + // binary search + auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp, + [](const std::pair & pair, uint32_t value) { + return pair.first < value; + }); + if (it != unicode_map_lowercase.end() && it->first == cp) { + return it->second; + } + return cp; // Return the original code point if no lowercase mapping is found } std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { From c83ad6d01e7b89ec71080d97c7e5db7ac1f4fda6 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 3 Oct 2024 01:49:47 +0200 Subject: [PATCH 30/40] ggml-backend : add device and backend reg interfaces (#9707) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- .github/workflows/bench.yml.disabled | 4 +- Makefile | 5 +- Package.swift | 2 +- ggml/include/ggml-backend.h | 203 +++-- ggml/include/ggml-blas.h | 6 +- ggml/include/ggml-cann.h | 18 +- ggml/include/ggml-cuda.h | 32 +- ggml/include/ggml-metal.h | 6 +- ggml/include/ggml-rpc.h | 10 +- ggml/include/ggml-sycl.h | 16 +- ggml/include/ggml-vulkan.h | 14 +- ggml/include/ggml.h | 68 +- ggml/src/CMakeLists.txt | 2 +- ggml/src/ggml-backend-impl.h | 226 +++-- ggml/src/{ggml-backend.c => ggml-backend.cpp} | 788 +++++++++++------- ggml/src/ggml-blas.cpp | 18 +- ggml/src/ggml-cann.cpp | 155 ++-- ggml/src/ggml-cuda.cu | 556 +++++++----- ggml/src/ggml-kompute.cpp | 25 +- ggml/src/ggml-metal.m | 49 +- ggml/src/ggml-rpc.cpp | 53 +- ggml/src/ggml-sycl.cpp | 107 +-- ggml/src/ggml-vulkan.cpp | 117 ++- ggml/src/ggml.c | 46 +- scripts/sync-ggml-am.sh | 4 +- scripts/sync-ggml.sh | 2 +- src/llama.cpp | 555 ++++++------ tests/test-backend-ops.cpp | 25 +- 28 files changed, 1809 insertions(+), 1303 deletions(-) rename ggml/src/{ggml-backend.c => ggml-backend.cpp} (77%) diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled index bfdbb4ef5..1c8787ef7 100644 --- a/.github/workflows/bench.yml.disabled +++ b/.github/workflows/bench.yml.disabled @@ -27,10 +27,10 @@ on: push: branches: - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] pull_request_target: types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] schedule: - cron: '04 2 * * *' diff --git a/Makefile b/Makefile index 11b11f5de..2793978c3 100644 --- a/Makefile +++ b/Makefile @@ -1054,10 +1054,11 @@ ggml/src/ggml-alloc.o: \ $(CC) $(CFLAGS) -c $< -o $@ ggml/src/ggml-backend.o: \ - ggml/src/ggml-backend.c \ + ggml/src/ggml-backend.cpp \ + ggml/src/ggml-backend-impl.h \ ggml/include/ggml.h \ ggml/include/ggml-backend.h - $(CC) $(CFLAGS) -c $< -o $@ + $(CXX) $(CXXFLAGS) -c $< -o $@ 
ggml/src/ggml-quants.o: \ ggml/src/ggml-quants.c \ diff --git a/Package.swift b/Package.swift index 1d90b47bf..3a17e6c34 100644 --- a/Package.swift +++ b/Package.swift @@ -11,7 +11,7 @@ var sources = [ "src/unicode-data.cpp", "ggml/src/ggml.c", "ggml/src/ggml-alloc.c", - "ggml/src/ggml-backend.c", + "ggml/src/ggml-backend.cpp", "ggml/src/ggml-quants.c", "ggml/src/ggml-aarch64.c", ] diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 71c0bef8e..b096aaed6 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -12,43 +12,52 @@ extern "C" { typedef struct ggml_backend_event * ggml_backend_event_t; typedef struct ggml_backend * ggml_backend_t; typedef void * ggml_backend_graph_plan_t; + typedef struct ggml_backend_reg * ggml_backend_reg_t; + typedef struct ggml_backend_device * ggml_backend_dev_t; + + + // + // Backend buffer type + // + + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); // // Backend buffer // - // buffer type - GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); - GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); - GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); - - // buffer enum ggml_backend_buffer_usage { GGML_BACKEND_BUFFER_USAGE_ANY = 0, GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2, }; - GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); - GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); - GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); - GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); - GGML_API void 
ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + + // tensor copy between different backends + GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); // - // Backend + // Backend (stream) // GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend); @@ -64,9 +73,9 @@ extern "C" { GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // "offset" refers to the offset of the tensor data for setting/getting data - GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - GGML_API GGML_CALL void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); GGML_API void ggml_backend_synchronize(ggml_backend_t backend); @@ -76,65 +85,121 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + + // NOTE: will be removed, use device version instead GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); - // tensor copy between different backends - GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); - // asynchronous copy // the copy is performed after all the currently queued 
operations in backend_src // backend_dst will wait for the copy to complete before performing other operations // automatic fallback to sync copy if async is not supported GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); - // events - GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend); - GGML_API void ggml_backend_event_free (ggml_backend_event_t event); - GGML_API void ggml_backend_event_record (ggml_backend_event_t event); - GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); - GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); + GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend); // - // CPU backend + // Events // - GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device); + GGML_API void ggml_backend_event_free(ggml_backend_event_t event); + GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend); + GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); + GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event); - GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); - GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); - GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); + // + // Backend device + // - // Create a backend buffer from an existing pointer - GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + enum ggml_backend_dev_type { + GGML_BACKEND_DEVICE_TYPE_CPU, + GGML_BACKEND_DEVICE_TYPE_GPU, + // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication) + GGML_BACKEND_DEVICE_TYPE_CPU_FULL, + GGML_BACKEND_DEVICE_TYPE_GPU_FULL + }; - GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + // functionality supported by the device + struct ggml_backend_dev_caps { + // asynchronous operations + bool async; + // pinned host buffer + bool host_buffer; + // event synchronization + bool events; + }; -#ifdef GGML_USE_CPU_HBM - GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); -#endif + // all the device properties + struct ggml_backend_dev_props { + const char * name; + const char * description; + size_t memory_free; + size_t memory_total; + enum ggml_backend_dev_type type; + struct ggml_backend_dev_caps caps; + }; + + GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); + GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device); + GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total); + GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device); + GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); + GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); + GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); + GGML_API ggml_backend_buffer_type_t 
ggml_backend_dev_buffer_type(ggml_backend_dev_t device); + GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); + GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); + + GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op); + GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); + GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); + + // + // Backend (reg) + // + + GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg); + GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg); + GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index); + GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name); + GGML_API void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data); + + // Functions that may be obtained using ggml_backend_reg_get_proc_address + typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *); // // Backend registry // - // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way + // Backend (reg) enumeration + GGML_API size_t ggml_backend_reg_count(void); + GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index); + GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name); - GGML_API size_t ggml_backend_reg_get_count(void); - GGML_API size_t ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found - GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional) - GGML_API const char * ggml_backend_reg_get_name(size_t i); - GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific - GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i); - GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size); + // Device enumeration + GGML_API size_t ggml_backend_dev_count(void); + GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index); + GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name); + GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type); + + // Set the log callback for all registered backends + GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data); + + // Direct backend (stream) initialization + // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params) + GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); + // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) + GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params); + // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL) + GGML_API ggml_backend_t ggml_backend_init_best(void); // // Backend scheduler // - // The backend scheduler allows for multiple backends to be used together + // The backend scheduler allows for multiple backend devices to be used together // Handles compute buffer 
allocation, assignment of tensors to backends, and copying of tensors between backends // The backends are selected based on: // - the backend that supports the operation @@ -169,9 +234,9 @@ extern "C" { } */ - struct ggml_backend_sched; typedef struct ggml_backend_sched * ggml_backend_sched_t; + // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback) // when ask == true, the scheduler wants to know if the user wants to observe this node // this allows the scheduler to batch nodes together in order to evaluate them in a single call // @@ -226,7 +291,7 @@ extern "C" { GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); - typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); + typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); @@ -235,6 +300,26 @@ extern "C" { GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor); + // + // CPU backend + // + + GGML_API ggml_backend_t ggml_backend_cpu_init(void); + + GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); + GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); + + // Create a backend buffer from an existing pointer + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + + GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + +#ifdef GGML_USE_CPU_HBM + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); +#endif #ifdef __cplusplus } diff --git a/ggml/include/ggml-blas.h b/ggml/include/ggml-blas.h index f2e37de06..dd612860d 100644 --- a/ggml/include/ggml-blas.h +++ b/ggml/include/ggml-blas.h @@ -9,13 +9,13 @@ extern "C" { #endif // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); +GGML_API ggml_backend_t ggml_backend_blas_init(void); -GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); +GGML_API bool ggml_backend_is_blas(ggml_backend_t backend); // number of threads used for conversion to float // for openblas and blis, this will also set the number of threads used for blas operations -GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); +GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); #ifdef __cplusplus diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h index 031ad1ce2..ba9ff2292 100644 --- a/ggml/include/ggml-cann.h +++ b/ggml/include/ggml-cann.h @@ -44,7 +44,7 @@ extern "C" { * @param device The index of the device to initialize. 
* @return A pointer to the initialized backend instance, or nullptr on failure. */ -GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device); +GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device); /** * @brief Checks if a given backend is a CANN backend. @@ -55,7 +55,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device); * @param backend The backend instance to check. * @return True if the backend is a CANN backend, false otherwise. */ -GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend); +GGML_API bool ggml_backend_is_cann(ggml_backend_t backend); /** * @brief Retrieves the CANN buffer type for a specified device. @@ -67,7 +67,7 @@ GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend); * @return A pointer to the buffer type interface for the specified device, or * nullptr if the device index is out of range. */ -GGML_API GGML_CALL ggml_backend_buffer_type_t +GGML_API ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device); /** @@ -78,14 +78,14 @@ ggml_backend_cann_buffer_type(int32_t device); * * @return The number of CANN devices available. */ -GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); +GGML_API int32_t ggml_backend_cann_get_device_count(void); /** * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. * * @return A pointer to the host buffer type interface. */ -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); +GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); /** * @brief Retrieves the description of a specific CANN device. @@ -97,7 +97,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type * @param description Pointer to a buffer where the description will be written. * @param description_size Size of the description buffer. */ -GGML_API GGML_CALL void ggml_backend_cann_get_device_description( +GGML_API void ggml_backend_cann_get_device_description( int32_t device, char* description, size_t description_size); /** @@ -112,9 +112,9 @@ GGML_API GGML_CALL void ggml_backend_cann_get_device_description( * @param total Pointer to a variable where the total memory size will be * stored. */ -GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, - size_t* free, - size_t* total); +GGML_API void ggml_backend_cann_get_device_memory(int32_t device, + size_t* free, + size_t* total); /** * @brief Set the logging callback for GGML. 
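The CANN-specific device queries above now have backend-agnostic counterparts in the registry/device API that this patch adds to `ggml-backend.h`. A minimal sketch (illustrative only, not part of the diff; the file name is arbitrary) that enumerates all registered devices using only functions declared in the hunks above:

```cpp
// list_devices.cpp - illustrative sketch of the new registry/device enumeration API
#include "ggml-backend.h"
#include <cstdio>

int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        size_t free = 0, total = 0;
        ggml_backend_dev_memory(dev, &free, &total);

        printf("device %zu: %s - %s (%zu/%zu MiB free)\n",
               i,
               ggml_backend_dev_name(dev),
               ggml_backend_dev_description(dev),
               free  / (1024 * 1024),
               total / (1024 * 1024));
    }
    return 0;
}
```

This assumes the program is linked against a ggml build that already contains this patch.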
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index 71bb6dcf0..a8feddc94 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -3,6 +3,10 @@ #include "ggml.h" #include "ggml-backend.h" +#ifdef __cplusplus +extern "C" { +#endif + #ifdef GGML_USE_HIPBLAS #define GGML_CUDA_NAME "ROCm" #define GGML_CUBLAS_NAME "hipBLAS" @@ -13,35 +17,33 @@ #define GGML_CUDA_NAME "CUDA" #define GGML_CUBLAS_NAME "cuBLAS" #endif - -#ifdef __cplusplus -extern "C" { -#endif - #define GGML_CUDA_MAX_DEVICES 16 // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device); +GGML_API ggml_backend_t ggml_backend_cuda_init(int device); -GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend); +GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); // device buffer -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); -GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void); -GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); -GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); +GGML_API int ggml_backend_cuda_get_device_count(void); +GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); -GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); -GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer); +GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); +GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data); + +GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void); + #ifdef __cplusplus } #endif diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index 4d4165324..55e6ecd84 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -1,3 +1,5 @@ +// Note: this description is outdated +// // An interface allowing to compute ggml_cgraph with Metal // // This is a fully functional interface that extends ggml with GPU support for Apple devices. 
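Backend-specific entry points such as `ggml_backend_cuda_init` and `ggml_backend_metal_init` remain, but with this patch a backend stream can also be obtained generically through the device layer. A hedged sketch (illustrative only; the file name and the 16 MiB size are arbitrary, and `ggml_backend_free` is the pre-existing release function, not touched by these hunks) using only declarations shown in the new `ggml-backend.h`:

```cpp
// init_backend.cpp - illustrative: device-based backend init and buffer allocation
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // prefers a fully capable GPU device and falls back to the CPU device
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == nullptr) {
        fprintf(stderr, "no usable backend device found\n");
        return 1;
    }

    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    printf("running on device: %s\n", ggml_backend_dev_name(dev));

    // allocate a buffer from the device's preferred buffer type
    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16 * 1024 * 1024);
    if (buf != nullptr) {
        printf("allocated %zu bytes from %s\n",
               ggml_backend_buffer_get_size(buf), ggml_backend_buft_name(buft));
        ggml_backend_buffer_free(buf);
    }

    ggml_backend_free(backend);
    return 0;
}
```

`ggml_backend_init_by_name` and `ggml_backend_init_by_type` (declared above) provide the same thing with explicit control over which device is chosen.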
@@ -43,11 +45,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); +GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); +GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family // ideally, the user code should be doing these checks diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h index aa144832a..64cde7f13 100644 --- a/ggml/include/ggml-rpc.h +++ b/ggml/include/ggml-rpc.h @@ -10,14 +10,14 @@ extern "C" { #define GGML_RPC_MAX_SERVERS 16 // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint); -GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend); +GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); +GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); +GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); -GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); +GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); -GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); +GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); #ifdef __cplusplus } diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index 43ab1519c..03b698e61 100644 --- a/ggml/include/ggml-sycl.h +++ b/ggml/include/ggml-sycl.h @@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device); GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); +GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); -GGML_API void ggml_backend_sycl_print_sycl_devices(void); -GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len); -GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size); -GGML_API GGML_CALL int ggml_backend_sycl_get_device_count(); -GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); +GGML_API void ggml_backend_sycl_print_sycl_devices(void); +GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len); +GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size); +GGML_API int ggml_backend_sycl_get_device_count(); +GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); // SYCL doesn't support 
registering host memory, keep here for reference -// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); -// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer); +// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); +// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); #ifdef __cplusplus } #endif diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h index af661c2d7..e074042ef 100644 --- a/ggml/include/ggml-vulkan.h +++ b/ggml/include/ggml-vulkan.h @@ -13,16 +13,16 @@ extern "C" { GGML_API void ggml_vk_instance_init(void); // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num); +GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); -GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend); -GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void); -GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); -GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); +GGML_API bool ggml_backend_is_vk(ggml_backend_t backend); +GGML_API int ggml_backend_vk_get_device_count(void); +GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); +GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); +GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); +GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); #ifdef __cplusplus } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index ce3d92cb2..969be3e94 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -187,16 +187,6 @@ # define GGML_API #endif -#ifdef GGML_MULTIPLATFORM -# if defined(_WIN32) -# define GGML_CALL -# else -# define GGML_CALL __attribute__((__ms_abi__)) -# endif -#else -# define GGML_CALL -#endif - // TODO: support for clang #ifdef __GNUC__ # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) @@ -340,7 +330,7 @@ extern "C" { }; // get ggml_status name string - GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status); + GGML_API const char * ggml_status_to_string(enum ggml_status status); // ieee 754-2008 half-precision float16 // todo: make this not an integral type @@ -716,46 +706,46 @@ extern "C" { GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_objects(const struct ggml_context * ctx); - GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor); - GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor); - GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor); - GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN + GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); + GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but 
padded to GGML_MEM_ALIGN - GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type); - GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block - GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + GGML_API int64_t ggml_blck_size(enum ggml_type type); + GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block + GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row GGML_DEPRECATED( GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float "use ggml_row_size() instead"); - GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type); - GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op); - GGML_API const char * ggml_op_symbol(enum ggml_op op); + GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); + GGML_API const char * ggml_op_symbol(enum ggml_op op); - GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); - GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name + GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); + GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name - GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor); + GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); - GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type); + GGML_API bool ggml_is_quantized(enum ggml_type type); // TODO: temporary until model loading of ggml examples is refactored GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); - GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor); - GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor); - GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); - GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars + GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); + GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars - GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor); - GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() - GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 - GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 + GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor 
* tensor); // same as ggml_is_contiguous() + GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 + GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -847,7 +837,7 @@ extern "C" { GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); - GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); @@ -1561,7 +1551,7 @@ extern "C" { "use ggml_rope_ext_inplace instead"); // compute correction dims for YaRN RoPE scaling - GGML_CALL void ggml_rope_yarn_corr_dims( + void ggml_rope_yarn_corr_dims( int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]); // rotary position embedding backward, i.e compute dx from dy diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 2982f00ec..286bec255 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1325,7 +1325,7 @@ add_library(ggml ../include/ggml-backend.h ggml.c ggml-alloc.c - ggml-backend.c + ggml-backend.cpp ggml-quants.c ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index b0d4141cc..470c922fe 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -9,145 +9,229 @@ extern "C" { #endif // - // Backend buffer + // Backend buffer type // - // buffer type - typedef void * ggml_backend_buffer_type_context_t; - struct ggml_backend_buffer_type_i { - const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); + const char * (*get_name) (ggml_backend_buffer_type_t buft); // allocate a buffer of this type - ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); + ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); // tensor alignment - size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); - // max buffer size that can be allocated - size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); - // data size needed to allocate the tensor, including padding - size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); - // check if tensor data is in host memory - bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); + size_t (*get_alignment) (ggml_backend_buffer_type_t buft); + // (optional) max buffer size that can be allocated (defaults to SIZE_MAX) + size_t (*get_max_size) (ggml_backend_buffer_type_t buft); + // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes) + size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); + // (optional) check if tensor data is in host memory (defaults to false) + bool (*is_host) (ggml_backend_buffer_type_t buft); }; struct ggml_backend_buffer_type { struct ggml_backend_buffer_type_i iface; - ggml_backend_buffer_type_context_t context; + ggml_backend_dev_t device; + 
void * context; }; - // buffer - typedef void * ggml_backend_buffer_context_t; + // + // Backend buffer + // struct ggml_backend_buffer_i { - const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer); - void (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer); - void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer); - void (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); - void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer - void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value); - void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + const char * (*get_name) (ggml_backend_buffer_t buffer); + // (optional) free the buffer + void (*free_buffer) (ggml_backend_buffer_t buffer); + // base address of the buffer + void * (*get_base) (ggml_backend_buffer_t buffer); + // (optional) initialize a tensor in the buffer (eg. add tensor extras) + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + // tensor data access + void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); + void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported) + bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); + // clear the entire buffer + void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + // (optional) reset any internal state due to tensor initialization, such as tensor extras + void (*reset) (ggml_backend_buffer_t buffer); }; struct ggml_backend_buffer { struct ggml_backend_buffer_i iface; ggml_backend_buffer_type_t buft; - ggml_backend_buffer_context_t context; + void * context; size_t size; enum ggml_backend_buffer_usage usage; }; - GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( - ggml_backend_buffer_type_t buft, - struct ggml_backend_buffer_i iface, - ggml_backend_buffer_context_t context, - size_t size); + ggml_backend_buffer_t ggml_backend_buffer_init( + ggml_backend_buffer_type_t buft, + struct ggml_backend_buffer_i iface, + void * context, + size_t size); // do not use directly, use ggml_backend_tensor_copy instead bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); + // multi-buffer // buffer that contains a collection of buffers - GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); - GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); - GGML_CALL void 
ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); + bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); + void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); // - // Backend + // Backend (stream) // - typedef void * ggml_backend_context_t; - struct ggml_backend_i { - const char * (*GGML_CALL get_name)(ggml_backend_t backend); + const char * (*get_name)(ggml_backend_t backend); - void (*GGML_CALL free)(ggml_backend_t backend); + void (*free)(ggml_backend_t backend); // buffer allocation - ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend); + ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); // (optional) asynchronous tensor data access - void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); // (optional) complete all pending operations - void (*GGML_CALL synchronize)(ggml_backend_t backend); + void (*synchronize)(ggml_backend_t backend); - // compute graph with a plan (not used currently) - // create a new plan for a graph - ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); - void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // (optional) compute graph with a plan (not used currently) + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); + void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology - void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); + void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); // compute the graph with the plan - enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph without a plan (async) - enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + // compute graph (always async if supported by the backend) + enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface + // new backends should 
implement the device interface instead + + // These functions are being moved to the device interface // check if the backend can compute an operation - bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op); // check if the backend can use tensors allocated in a buffer type - bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); + bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer // these should be expensive operations with large batch sizes that may benefit from running on this backend // even if the weight has to be copied from the CPU temporarily - bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); + bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op); // (optional) event synchronization - // create a new event that can record events on this backend instance - ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); - void (*GGML_CALL event_free) (ggml_backend_event_t event); - // record an event on the backend instance that created it - void (*GGML_CALL event_record) (ggml_backend_event_t event); - // wait for an event on on a different backend instance - void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event); - // block until an event is recorded - void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); + // record an event on this stream + void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event); + // wait for an event on on a different stream + void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event); }; struct ggml_backend { ggml_guid_t guid; - struct ggml_backend_i iface; - ggml_backend_context_t context; + ggml_backend_dev_t device; + void * context; }; struct ggml_backend_event { - ggml_backend_t backend; + struct ggml_backend_device * device; void * context; }; // - // Backend registry + // Backend device // - typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data); + // Note: if additional properties are needed, we should add a struct with all of them + // the current functions to obtain the properties can remain, since they are more convenient for often used properties + struct ggml_backend_device_i { + // device name: short identifier for this device, such as "CPU" or "CUDA0" + const char * (*get_name)(ggml_backend_dev_t dev); - GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); + // device description: short informative description of the device, could be the model name + const char * (*get_description)(ggml_backend_dev_t dev); + + // device memory in bytes + void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total); + + // device type + enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev); + + // device properties + void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props); + + // backend (stream) initialization + ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params); + + // preferred buffer type + ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev); + + // (optional) host buffer type (in system memory, typically this is a pinned memory 
buffer for faster transfers between host and device) + ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev); + + // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries) + ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size); + + // check if the backend can compute an operation + bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op); + + // check if the backend can use tensors allocated in a buffer type + bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft); + + // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer + // these should be expensive operations with large batch sizes that may benefit from running on this backend + // even if the weight has to be copied from the CPU temporarily + bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op); + + // (optional) event synchronization + ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev); + void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event); + void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event); + }; + + struct ggml_backend_device { + struct ggml_backend_device_i iface; + ggml_backend_reg_t reg; + void * context; + }; + + // + // Backend (reg) + // + + struct ggml_backend_reg_i { + const char * (*get_name)(ggml_backend_reg_t reg); + + // enumerate available devices + size_t (*get_device_count)(ggml_backend_reg_t reg); + ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index); + + // (optional) get a pointer to a function in the backend + // backends can add custom functions that are not part of the standard ggml-backend interface + void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name); + + // (optional) set the log callback for the backend + void (*set_log_callback)(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data); + }; + + struct ggml_backend_reg { + // int api_version; // TODO: for dynamic loading + struct ggml_backend_reg_i iface; + void * context; + }; + + + // Internal backend registry API + void ggml_backend_register(ggml_backend_reg_t reg); + void ggml_backend_device_register(ggml_backend_dev_t device); + // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function + // typedef ggml_backend_register_t * (*ggml_backend_init)(void); #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.cpp similarity index 77% rename from ggml/src/ggml-backend.c rename to ggml/src/ggml-backend.cpp index ba280e064..73a2b24f8 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.cpp @@ -1,3 +1,5 @@ +// Note: porting this file to C++ is a work in progress + #include "ggml-backend-impl.h" #include "ggml-alloc.h" #include "ggml-impl.h" @@ -9,8 +11,7 @@ #include #include - -#define MAX(a, b) ((a) > (b) ? 
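The rename of ggml-backend.c to ggml-backend.cpp (with the "porting this file to C++ is a work in progress" note above) is what drives many of the small mechanical changes in the hunks that follow: allocation results now need explicit casts, MAX becomes std::max, and struct returns use brace initialization. A minimal illustration of the cast issue, using a hypothetical `split` struct that stands in for the real scheduler types, not code from this patch:

    // C accepts the implicit void* conversion; C++ rejects it, which is why
    // casts such as "(int *) calloc(...)" appear throughout this file.
    #include <cstdlib>

    struct split { int backend_id; };

    int main() {
        // split * s = malloc(sizeof(split));              // OK in C, compile error in C++
        split * s = (split *) malloc(16 * sizeof(split));  // fine in both languages
        s = (split *) realloc(s, 32 * sizeof(split));      // same for realloc
        free(s);
        return 0;
    }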
(a) : (b)) +#include // backend buffer type @@ -18,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { return buft->iface.get_name(buft); } -GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -34,7 +35,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { return SIZE_MAX; } -GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { +size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { size_t size = buft->iface.get_alloc_size(buft, tensor); @@ -51,16 +52,18 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { return false; } +ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) { + return buft->device; +} + // backend buffer -GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( - ggml_backend_buffer_type_t buft, - struct ggml_backend_buffer_i iface, - ggml_backend_buffer_context_t context, - size_t size) { - ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); - - (*buffer) = (struct ggml_backend_buffer) { +ggml_backend_buffer_t ggml_backend_buffer_init( + ggml_backend_buffer_type_t buft, + struct ggml_backend_buffer_i iface, + void * context, + size_t size) { + ggml_backend_buffer_t buffer = new ggml_backend_buffer { /* .interface = */ iface, /* .buft = */ buft, /* .context = */ context, @@ -83,7 +86,7 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } - free(buffer); + delete buffer; } size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { @@ -98,14 +101,14 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { return base; } -GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { // init_tensor is optional if (buffer->iface.init_tensor) { buffer->iface.init_tensor(buffer, tensor); } } -size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) { +size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); } @@ -218,7 +221,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten } } -GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); @@ -232,7 +235,7 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * buf->iface.set_tensor(buf, tensor, data, offset, size); } -GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); @@ -246,7 +249,7 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * buf->iface.get_tensor(buf, tensor, data, offset, size); } -GGML_API GGML_CALL void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); @@ -299,20 +302,39 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + // helper to ease transition to device interface + if (backend->device) { + return ggml_backend_dev_supports_op(backend->device, op); + } + return backend->iface.supports_op(backend, op); } bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + // helper to ease transition to device interface + if (backend->device) { + return ggml_backend_dev_supports_buft(backend->device, buft); + } + return backend->iface.supports_buft(backend, buft); } bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { + // helper to ease transition to device interface + if (backend->device) { + return ggml_backend_dev_offload_op(backend->device, op); + } + if (backend->iface.offload_op != NULL) { return backend->iface.offload_op(backend, op); } return false; } +ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) { + return backend->device; +} + // backend copy static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { @@ -375,30 +397,31 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b // events -ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) { - if (backend->iface.event_new == NULL) { +ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) { + // null device is allowed for the transition period to the device interface + if (device == NULL || device->iface.event_new == NULL) { return NULL; } - return backend->iface.event_new(backend); + return device->iface.event_new(device); } void ggml_backend_event_free(ggml_backend_event_t event) { if (event == NULL) { return; } - event->backend->iface.event_free(event); + event->device->iface.event_free(event->device, event); } -void ggml_backend_event_record(ggml_backend_event_t event) { - GGML_ASSERT(event->backend->iface.event_record != NULL); +void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) { + GGML_ASSERT(backend->iface.event_record != NULL); - event->backend->iface.event_record(event); + backend->iface.event_record(backend, event); } void ggml_backend_event_synchronize(ggml_backend_event_t event) { - GGML_ASSERT(event->backend->iface.event_synchronize != NULL); + GGML_ASSERT(event->device->iface.event_synchronize); - event->backend->iface.event_synchronize(event); + event->device->iface.event_synchronize(event->device, event); } void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { @@ -407,170 +430,236 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) backend->iface.event_wait(backend, event); } -// backend registry +// Backend device -#define GGML_REG_MAX_BACKENDS 64 
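With this change, events are created from a device instead of a backend, and recording names the backend (stream) explicitly. The sketch below assumes two already-initialized backends whose device implements the optional event callbacks, and that the public ggml-backend.h declares the signatures used in the hunks above; the function name is made up for illustration.

    #include "ggml-backend.h"

    // minimal sketch of the new event flow
    static void wait_for_async_work(ggml_backend_t backend_src, ggml_backend_t backend_dst) {
        ggml_backend_event_t ev = ggml_backend_event_new(ggml_backend_get_device(backend_src));
        if (ev == NULL) {
            return; // the device does not implement events
        }

        // ... enqueue asynchronous work on backend_src here ...

        ggml_backend_event_record(ev, backend_src); // record on the stream that did the work
        ggml_backend_event_wait(backend_dst, ev);   // make another stream wait for it
        ggml_backend_event_synchronize(ev);         // or block the host until it completes

        ggml_backend_event_free(ev);
    }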
+const char * ggml_backend_dev_name(ggml_backend_dev_t device) { + return device->iface.get_name(device); +} -struct ggml_backend_reg { - char name[128]; - ggml_backend_init_fn init_fn; - ggml_backend_buffer_type_t default_buffer_type; - void * user_data; -}; +const char * ggml_backend_dev_description(ggml_backend_dev_t device) { + return device->iface.get_description(device); +} -static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS]; -static size_t ggml_backend_registry_count = 0; +void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + device->iface.get_memory(device, free, total); +} -GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); +enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) { + return device->iface.get_type(device); +} -GGML_CALL static void ggml_backend_registry_init(void) { - static bool initialized = false; +void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) { + device->iface.get_props(device, props); +} - if (initialized) { - return; +ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) { + return device->reg; +} + +ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) { + return device->iface.init_backend(device, params); +} + +ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) { + return device->iface.get_buffer_type(device); +} + +ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) { + return device->iface.get_host_buffer_type(device); +} + +ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) { + return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size); +} + +bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { + return device->iface.supports_op(device, op); +} + +bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) { + return device->iface.supports_buft(device, buft); +} + +bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { + return device->iface.offload_op(device, op); +} + +// Backend (reg) + +const char * ggml_backend_reg_name(ggml_backend_reg_t reg) { + return reg->iface.get_name(reg); +} + +size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) { + return reg->iface.get_device_count(reg); +} + +ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) { + return reg->iface.get_device(reg, index); +} + +void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (!reg->iface.get_proc_address) { + return NULL; + } + return reg->iface.get_proc_address(reg, name); +} + +void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) { + if (reg->iface.set_log_callback) { + reg->iface.set_log_callback(reg, log_callback, user_data); + } +} + +// Backend registry + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +struct ggml_backend_registry { + std::vector backends; + std::vector devices; + + ggml_backend_registry() { +#ifdef GGML_USE_CUDA + register_backend(ggml_backend_cuda_reg()); +#endif + + register_backend(ggml_backend_cpu_reg()); + + // TODO: sycl, metal, vulkan, kompute, cann } - initialized = true; - - 
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL); - - // add forward decls here to avoid including the backend headers -#ifdef GGML_USE_CUDA - extern GGML_CALL void ggml_backend_cuda_reg_devices(void); - ggml_backend_cuda_reg_devices(); -#endif - -#ifdef GGML_USE_SYCL - extern void ggml_backend_sycl_reg_devices(void); - ggml_backend_sycl_reg_devices(); -#endif - -#ifdef GGML_USE_METAL - extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); - extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); - ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); -#endif - -#ifdef GGML_USE_VULKAN - extern GGML_CALL int ggml_backend_vk_reg_devices(void); - ggml_backend_vk_reg_devices(); -#endif - -#ifdef GGML_USE_KOMPUTE - extern GGML_CALL void ggml_backend_kompute_reg_devices(void); - ggml_backend_kompute_reg_devices(); -#endif - -#ifdef GGML_USE_CANN - extern GGML_CALL int ggml_backend_cann_reg_devices(void); - ggml_backend_cann_reg_devices(); -#endif -} - -GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { - GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS); - - size_t id = ggml_backend_registry_count; - - ggml_backend_registry[id] = (struct ggml_backend_reg) { - /* .name = */ {0}, - /* .fn = */ init_fn, - /* .default_buffer_type = */ default_buffer_type, - /* .user_data = */ user_data, - }; - - snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name); - + void register_backend(ggml_backend_reg_t reg) { #ifndef NDEBUG - fprintf(stderr, "%s: registered backend %s\n", __func__, name); + fprintf(stderr, "%s: registered backend %s (%zu devices)\n", + __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif - - ggml_backend_registry_count++; -} - -size_t ggml_backend_reg_get_count(void) { - ggml_backend_registry_init(); - - return ggml_backend_registry_count; -} - -size_t ggml_backend_reg_find_by_name(const char * name) { - ggml_backend_registry_init(); - - for (size_t i = 0; i < ggml_backend_registry_count; i++) { - // TODO: case insensitive in a portable way - if (strcmp(ggml_backend_registry[i].name, name) == 0) { - return i; + backends.push_back(reg); + for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { + register_device(ggml_backend_reg_dev_get(reg, i)); } } - // not found - return SIZE_MAX; + void register_device(ggml_backend_dev_t device) { +#ifndef NDEBUG + fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device)); +#endif + devices.push_back(device); + } +}; + +static ggml_backend_registry & get_reg() { + static ggml_backend_registry reg; + return reg; } -// init from backend:params string -ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) { - ggml_backend_registry_init(); +// Internal API +void ggml_backend_register(ggml_backend_reg_t reg) { + get_reg().register_backend(reg); +} - const char * params = strchr(backend_str, ':'); - char backend_name[128]; - if (params == NULL) { - snprintf(backend_name, sizeof(backend_name), "%s", backend_str); - params = ""; - } else { - snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str); - params++; +void ggml_backend_device_register(ggml_backend_dev_t device) { + 
get_reg().register_device(device); +} + +// Backend (reg) enumeration +size_t ggml_backend_reg_count() { + return get_reg().backends.size(); +} + +ggml_backend_reg_t ggml_backend_reg_get(size_t index) { + GGML_ASSERT(index < ggml_backend_reg_count()); + return get_reg().backends[index]; +} + +ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + ggml_backend_reg_t reg = ggml_backend_reg_get(i); + if (strcmp(ggml_backend_reg_name(reg), name) == 0) { + return reg; + } } + return NULL; +} - size_t backend_i = ggml_backend_reg_find_by_name(backend_name); +// Device enumeration +size_t ggml_backend_dev_count() { + return get_reg().devices.size(); +} - if (backend_i == SIZE_MAX) { - fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name); +ggml_backend_dev_t ggml_backend_dev_get(size_t index) { + GGML_ASSERT(index < ggml_backend_dev_count()); + return get_reg().devices[index]; +} + +ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (strcmp(ggml_backend_dev_name(dev), name) == 0) { + return dev; + } + } + return NULL; +} + +ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == type) { + return dev; + } + } + return NULL; +} + +void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data) { + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + ggml_backend_reg_t reg = ggml_backend_reg_get(i); + ggml_backend_reg_set_log_callback(reg, log_callback, user_data); + } +} + +// Convenience functions +ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { + ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); + if (!dev) { return NULL; } - - return ggml_backend_reg_init_backend(backend_i, params); + return ggml_backend_dev_init(dev, params); } -const char * ggml_backend_reg_get_name(size_t i) { - ggml_backend_registry_init(); - - GGML_ASSERT(i < ggml_backend_registry_count); - return ggml_backend_registry[i].name; +ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { + ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); + if (!dev) { + return NULL; + } + return ggml_backend_dev_init(dev, params); } -ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) { - ggml_backend_registry_init(); - - GGML_ASSERT(i < ggml_backend_registry_count); - return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data); -} - -ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) { - ggml_backend_registry_init(); - - GGML_ASSERT(i < ggml_backend_registry_count); - return ggml_backend_registry[i].default_buffer_type; -} - -ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { - ggml_backend_registry_init(); - - GGML_ASSERT(i < ggml_backend_registry_count); - return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size); +ggml_backend_t ggml_backend_init_best(void) { + ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL); + if (!dev) { + dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL); + } + if (!dev) { + return NULL; + } + return ggml_backend_dev_init(dev, NULL); } // backend 
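Taken together, the new registry front-end replaces the old name/init_fn table with device objects that can be enumerated and queried. A usage sketch, assuming the public ggml-backend.h declares the functions defined above (ggml_backend_free is the pre-existing destructor):

    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        // list all registered devices
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            size_t free_mem = 0, total_mem = 0;
            ggml_backend_dev_memory(dev, &free_mem, &total_mem);
            printf("device %zu: %s (%s), %zu/%zu bytes free\n", i,
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                free_mem, total_mem);
        }

        // GPU_FULL device if available, otherwise CPU_FULL
        ggml_backend_t backend = ggml_backend_init_best();
        if (backend == NULL) {
            fprintf(stderr, "no usable device\n");
            return 1;
        }

        // ... allocate buffers and compute graphs with the backend ...

        ggml_backend_free(backend);
        return 0;
    }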
CPU static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment -GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) { return "CPU"; GGML_UNUSED(buffer); } -GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { uintptr_t data = (uintptr_t)buffer->context; // align the buffer @@ -581,29 +670,29 @@ GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t b return (void *)data; } -GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); } -GGML_CALL static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { memset((char *)tensor->data + offset, value, size); GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); GGML_UNUSED(buffer); } -GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; @@ -613,12 +702,12 @@ GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t b GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { memset(buffer->context, value, buffer->size); } -static struct ggml_backend_buffer_i cpu_backend_buffer_i = { - /* .get_name = */ ggml_backend_cpu_buffer_name, +static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { + /* .get_name = */ ggml_backend_cpu_buffer_get_name, /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -630,9 +719,8 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .reset = */ NULL, }; -// for buffers from ptr, free is not called -static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { - /* .get_name = */ ggml_backend_cpu_buffer_name, +static const struct ggml_backend_buffer_i 
ggml_backend_cpu_buffer_from_ptr_i = { + /* .get_name = */ ggml_backend_cpu_buffer_get_name, /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -644,13 +732,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .reset = */ NULL, }; -GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU"; GGML_UNUSED(buft); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h) if (data == NULL) { @@ -658,24 +746,24 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer return NULL; } - return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); + return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size); } -GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return TENSOR_ALIGNMENT; GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; GGML_UNUSED(buft); } -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { +ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { - /* .iface = */ { + /* .iface = */ { /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, @@ -683,6 +771,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .context = */ NULL, }; @@ -695,23 +784,23 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { #include -GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU_HBM"; GGML_UNUSED(buft); } -GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { +static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { return "CPU_HBM"; GGML_UNUSED(buf); } -GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t 
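As the hunks above show, the CPU buffer type over-allocates by TENSOR_ALIGNMENT and aligns the base address, so get_base always returns 32-byte aligned memory. A small sketch of the generic buffer-type API, using only the public functions that appear in this file:

    #include "ggml-backend.h"
    #include <cstdio>
    #include <cstdint>

    int main() {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        // allocate 1 MiB through the buffer-type interface
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024*1024);
        if (buf == NULL) {
            return 1;
        }

        void * base = ggml_backend_buffer_get_base(buf);
        printf("base %p (32-byte aligned: %d), size %zu\n",
            base, (int) ((uintptr_t) base % 32 == 0), ggml_backend_buffer_get_size(buf));

        ggml_backend_buffer_free(buf);
        return 0;
    }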
ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { //void * ptr = hbw_malloc(size); void * ptr; int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); @@ -749,27 +838,27 @@ struct ggml_backend_cpu_context { int n_threads; ggml_threadpool_t threadpool; - void * work_data; + uint8_t * work_data; size_t work_size; ggml_abort_callback abort_callback; void * abort_callback_data; }; -GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) { +static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { return "CPU"; GGML_UNUSED(backend); } -GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) { +static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - free(cpu_ctx->work_data); - free(cpu_ctx); - free(backend); + delete[] cpu_ctx->work_data; + delete cpu_ctx; + delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_cpu_buffer_type(); GGML_UNUSED(backend); @@ -780,18 +869,18 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); + struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu; cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { - cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size]; if (cpu_plan->cplan.work_data == NULL) { - free(cpu_plan); + delete cpu_plan; return NULL; } } @@ -802,16 +891,16 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg return cpu_plan; } -GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; - free(cpu_plan->cplan.work_data); - free(cpu_plan); + delete[] cpu_plan->cplan.work_data; + delete cpu_plan; GGML_UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); @@ -819,21 +908,21 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe GGML_UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status 
ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { - free(cpu_ctx->work_data); - cpu_ctx->work_data = malloc(cplan.work_size); + delete[] cpu_ctx->work_data; + cpu_ctx->work_data = new uint8_t[cplan.work_size]; if (cpu_ctx->work_data == NULL) { cpu_ctx->work_size = 0; return GGML_STATUS_ALLOC_FAILED; } cpu_ctx->work_size = cplan.work_size; } - cplan.work_data = cpu_ctx->work_data; + cplan.work_data = (uint8_t *)cpu_ctx->work_data; cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; @@ -841,35 +930,8 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t return ggml_graph_compute(cgraph, &cplan); } -GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - switch (op->op) { - case GGML_OP_CPY: - return - op->type != GGML_TYPE_IQ2_XXS && - op->type != GGML_TYPE_IQ2_XS && - op->type != GGML_TYPE_IQ1_S && - op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float - case GGML_OP_MUL_MAT: - return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; - case GGML_OP_ROPE_BACK: - return op->src[2] == NULL && (op->op_params[2] & 4) == 0; - case GGML_OP_IM2COL_BACK: - return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; - default: - return true; - } - - GGML_UNUSED(backend); -} - -GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); - - GGML_UNUSED(backend); -} - -static struct ggml_backend_i cpu_backend_i = { - /* .get_name = */ ggml_backend_cpu_name, +static const struct ggml_backend_i ggml_backend_cpu_i = { + /* .get_name = */ ggml_backend_cpu_get_name, /* .free = */ ggml_backend_cpu_free, /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type, /* .set_tensor_async = */ NULL, @@ -881,14 +943,11 @@ static struct ggml_backend_i cpu_backend_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, - /* .supports_op = */ ggml_backend_cpu_supports_op, - /* .supports_buft = */ ggml_backend_cpu_supports_buft, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_cpu_guid(void) { @@ -897,7 +956,7 @@ static ggml_guid_t ggml_backend_cpu_guid(void) { } ggml_backend_t ggml_backend_cpu_init(void) { - struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context; if (ctx == NULL) { return NULL; } @@ -909,21 +968,22 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; - ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + ggml_backend_t cpu_backend = new ggml_backend { + /* .guid = */ ggml_backend_cpu_guid(), + /* .interface = */ ggml_backend_cpu_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context 
= */ ctx, + }; + if (cpu_backend == NULL) { - free(ctx); + delete ctx; return NULL; } - *cpu_backend = (struct ggml_backend) { - /* .guid = */ ggml_backend_cpu_guid(), - /* .interface = */ cpu_backend_i, - /* .context = */ ctx - }; return cpu_backend; } -GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) { +bool ggml_backend_is_cpu(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid()); } @@ -954,16 +1014,163 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_ ctx->abort_callback_data = abort_callback_data; } -GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { +ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); - return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); + return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); } -GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) { +//////////////////////// + +static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) { + return "CPU"; + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) { + // TODO + return "CPU"; + + GGML_UNUSED(dev); +} + +static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + // TODO + *free = 0; + *total = 0; + + GGML_UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) { + return GGML_BACKEND_DEVICE_TYPE_CPU_FULL; + + GGML_UNUSED(dev); +} + +static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_cpu_device_get_name(dev); + props->description = ggml_backend_cpu_device_get_description(dev); + props->type = ggml_backend_cpu_device_get_type(dev); + ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* async */ false, + /* host_buffer */ false, + /* events */ false, + }; +} + +static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) { return ggml_backend_cpu_init(); + GGML_UNUSED(dev); GGML_UNUSED(params); - GGML_UNUSED(user_data); +} + +static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(dev); +} + +static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); +} + +static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + switch (op->op) { + case GGML_OP_CPY: + return + op->type != GGML_TYPE_IQ2_XXS && + op->type != GGML_TYPE_IQ2_XS && + op->type != GGML_TYPE_IQ1_S && + op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float + case GGML_OP_MUL_MAT: + return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; + case GGML_OP_ROPE_BACK: + return op->src[2] == NULL && (op->op_params[2] & 4) == 0; + case GGML_OP_IM2COL_BACK: + return op->src[0]->type == 
GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_OUT_PROD: + return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32; + default: + return true; + } + + GGML_UNUSED(dev); +} + +static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(dev); +} + +static const struct ggml_backend_device_i ggml_backend_cpu_device_i = { + /* .get_name = */ ggml_backend_cpu_device_get_name, + /* .get_description = */ ggml_backend_cpu_device_get_description, + /* .get_memory = */ ggml_backend_cpu_device_get_memory, + /* .get_type = */ ggml_backend_cpu_device_get_type, + /* .get_props = */ ggml_backend_cpu_device_get_props, + /* .init_backend = */ ggml_backend_cpu_device_init, + /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_cpu_device_supports_op, + /* .supports_buft = */ ggml_backend_cpu_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +//////////////////////// + +static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) { + return "CPU"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) { + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + static ggml_backend_device ggml_backend_cpu_device = { + /* .iface = */ ggml_backend_cpu_device_i, + /* .reg = */ reg, + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_device; + + GGML_UNUSED(reg); + GGML_UNUSED(index); +} + +static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = { + /* .get_name = */ ggml_backend_cpu_reg_get_name, + /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count, + /* .get_device = */ ggml_backend_cpu_reg_get_device, + /* .get_proc_address = */ NULL, + /* .set_log_callback = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_cpu_reg(void) { + static struct ggml_backend_reg ggml_backend_cpu_reg = { + /* .iface = */ ggml_backend_cpu_reg_i, + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_reg; } // multi-buffer buffer @@ -973,16 +1180,14 @@ struct ggml_backend_multi_buffer_context { size_t n_buffers; }; -typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t; - -GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context; +static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; return ctx->buffers[0]->iface.get_name(ctx->buffers[0]); } -GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context; +static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { ggml_backend_buffer_free(ctx->buffers[i]); } @@ 
-991,32 +1196,28 @@ GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_ free(ctx); } -GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context; +static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { ggml_backend_buffer_clear(ctx->buffers[i], value); } } -static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) { - static struct ggml_backend_buffer_i multi_backend_buffer_i = { - /* .get_name = */ ggml_backend_multi_buffer_get_name, - /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer, - /* .get_base = */ NULL, - /* .init_tensor = */ NULL, - /* .memset_tensor = */ NULL, - /* .set_tensor = */ NULL, - /* .get_tensor = */ NULL, - /* .cpy_tensor = */ NULL, - /* .clear = */ ggml_backend_multi_buffer_clear, - /* .reset = */ NULL, - }; +static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = { + /* .get_name = */ ggml_backend_multi_buffer_get_name, + /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer, + /* .get_base = */ NULL, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ NULL, + /* .set_tensor = */ NULL, + /* .get_tensor = */ NULL, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_multi_buffer_clear, + /* .reset = */ NULL, +}; - return multi_backend_buffer_i; -} - -GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) { - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context)); +ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) { + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context)); ctx->n_buffers = n_buffers; ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t)); @@ -1028,16 +1229,16 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back total_size += ggml_backend_buffer_get_size(buffers[i]); } - return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size); + return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size); } -GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) { +bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_multi_buffer_get_name; } -GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { +void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer)); - ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context; + ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { ggml_backend_buffer_set_usage(ctx->buffers[i], usage); } @@ -1592,7 +1793,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg 
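The multi-buffer shown in the hunk above wraps a collection of buffers behind the regular buffer interface: clearing or freeing the wrapper is forwarded to every child, and set_usage propagates the usage flag. A sketch of how a caller, for example a model loader that splits weights across allocations, might use it; the helper name is hypothetical and GGML_BACKEND_BUFFER_USAGE_WEIGHTS is assumed to be the usual usage value from ggml-backend.h:

    #include "ggml-backend-impl.h" // declares the multi-buffer helpers used below

    static ggml_backend_buffer_t wrap_weight_buffers(ggml_backend_buffer_t a, ggml_backend_buffer_t b) {
        ggml_backend_buffer_t parts[2] = { a, b };

        ggml_backend_buffer_t multi = ggml_backend_multi_buffer_alloc_buffer(parts, 2);

        // the usage flag is forwarded to both children
        ggml_backend_multi_buffer_set_usage(multi, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        // freeing 'multi' later also frees 'a' and 'b'
        return multi;
    }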
i_split++; if (i_split >= sched->splits_capacity) { sched->splits_capacity *= 2; - sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); + sched->splits = (ggml_backend_sched_split *) + realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); GGML_ASSERT(sched->splits != NULL); } split = &sched->splits[i_split]; @@ -1678,11 +1880,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->prev_leaf_backend_ids = tmp; } - int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies; + int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies; if (sched->graph.size < graph_size) { sched->graph.size = graph_size; - sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); - sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); + sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); + sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); GGML_ASSERT(sched->graph.nodes != NULL); GGML_ASSERT(sched->graph.leafs != NULL); } @@ -1881,7 +2083,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s // record the event of this copy if (split->n_inputs > 0) { if (sched->events[split_backend_id][sched->cur_copy] != NULL) { - ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]); + ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend); } } } @@ -1901,7 +2103,7 @@ ggml_backend_sched_t ggml_backend_sched_new( GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU - struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched)); sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; sched->n_backends = n_backends; @@ -1910,21 +2112,21 @@ ggml_backend_sched_t ggml_backend_sched_new( // initialize hash table // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead) sched->hash_set = ggml_hash_set_new(graph_size); - sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); - sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); + sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); + sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; - sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); - sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); - sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); - sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); + sched->node_backend_ids = (int *) 
calloc(nodes_size, sizeof(sched->node_backend_ids[0])); + sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); + sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); - sched->context_buffer = malloc(sched->context_buffer_size); + sched->context_buffer = (char *) malloc(sched->context_buffer_size); const int initial_splits_capacity = 16; - sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0])); + sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0])); sched->splits_capacity = initial_splits_capacity; for (int b = 0; b < n_backends; b++) { @@ -1933,7 +2135,7 @@ ggml_backend_sched_t ggml_backend_sched_new( GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b])); if (sched->n_copies > 1) { for (int c = 0; c < sched->n_copies; c++) { - sched->events[b][c] = ggml_backend_event_new(backends[b]); + sched->events[b][c] = ggml_backend_event_new(backends[b]->device); } } } @@ -2169,8 +2371,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); - struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT - bool * node_init = calloc(hash_set.size, sizeof(node_init[0])); + struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT + bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0])); struct ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), @@ -2188,7 +2390,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s free(node_init); ggml_free(ctx_allocated); ggml_free(ctx_unallocated); - return (struct ggml_backend_graph_copy) { + return { /* .buffer = */ NULL, /* .ctx_allocated = */ NULL, /* .ctx_unallocated = */ NULL, @@ -2211,7 +2413,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s free(node_init); ggml_free(ctx_allocated); ggml_free(ctx_unallocated); - return (struct ggml_backend_graph_copy) { + return { /* .buffer = */ NULL, /* .ctx_allocated = */ NULL, /* .ctx_unallocated = */ NULL, @@ -2240,7 +2442,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s free(node_copies); free(node_init); - return (struct ggml_backend_graph_copy) { + return { /* .buffer = */ buffer, /* .ctx_allocated = */ ctx_allocated, /* .ctx_unallocated = */ ctx_unallocated, diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 6d99c6bea..b850e6a8d 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -235,25 +235,25 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g // backend interface -GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { +static const char * ggml_backend_blas_name(ggml_backend_t backend) { return "BLAS"; GGML_UNUSED(backend); } -GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { +static void 
ggml_backend_blas_free(ggml_backend_t backend) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; delete ctx; delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_cpu_buffer_type(); GGML_UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -285,7 +285,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t GGML_UNUSED(backend); } -GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; @@ -300,7 +300,7 @@ GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, cons GGML_UNUSED(backend); } -GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { +static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { return ggml_backend_buft_is_host(buft); GGML_UNUSED(backend); @@ -322,11 +322,8 @@ static struct ggml_backend_i blas_backend_i = { /* .supports_op = */ ggml_backend_blas_supports_op, /* .supports_buft = */ ggml_backend_blas_supports_buft, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_blas_guid(void) { @@ -340,6 +337,7 @@ ggml_backend_t ggml_backend_blas_init(void) { ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_blas_guid(), /* .interface = */ blas_backend_i, + /* .device = */ nullptr, /* .context = */ ctx, }; @@ -356,7 +354,7 @@ ggml_backend_t ggml_backend_blas_init(void) { return backend; } -GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { +bool ggml_backend_is_blas(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); } diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index d3ab78006..63ad0b878 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -560,7 +560,7 @@ struct ggml_backend_cann_buffer_context { * @return A pointer to a C-string containing the name of the buffer. */ -GGML_CALL static const char* ggml_backend_cann_buffer_get_name( +static const char* ggml_backend_cann_buffer_get_name( ggml_backend_buffer_t buffer) { return "CANN"; @@ -576,7 +576,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_get_name( * @param buffer The buffer to check. * @return true if the buffer is a CANN buffer, false otherwise. */ -GGML_CALL static bool ggml_backend_buffer_is_cann( +static bool ggml_backend_buffer_is_cann( ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_cann_buffer_get_name; } @@ -589,7 +589,7 @@ GGML_CALL static bool ggml_backend_buffer_is_cann( * * @param buffer The CANN buffer to free. 
*/ -GGML_CALL static void ggml_backend_cann_buffer_free_buffer( +static void ggml_backend_cann_buffer_free_buffer( ggml_backend_buffer_t buffer) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; @@ -605,7 +605,7 @@ GGML_CALL static void ggml_backend_cann_buffer_free_buffer( * @param buffer The CANN buffer whose base pointer is to be retrieved. * @return A pointer to the base of the device memory allocated for the buffer. */ -GGML_CALL static void* ggml_backend_cann_buffer_get_base( +static void* ggml_backend_cann_buffer_get_base( ggml_backend_buffer_t buffer) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; @@ -625,9 +625,9 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base( * @param dst Pointer to the destination buffer where transformed data will be * stored. */ -GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, - const void* src, - void* dst) { +static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, + const void* src, + void* dst) { int64_t n_elems = ggml_nelements(tensor); int64_t groups = n_elems / QK4_0; @@ -677,7 +677,7 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, * @param dst Pointer to the destination buffer where the Q4.0 formatted data * will be stored. */ -GGML_CALL static void ggml_backend_cann_transform_back_q4_0( +static void ggml_backend_cann_transform_back_q4_0( const ggml_tensor* tensor, void* src, void* dst) { int64_t n_elems = ggml_nelements(tensor); @@ -726,9 +726,9 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0( * @param dst Pointer to the destination buffer where transformed data will be * stored. */ -GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, - const void* src, - void* dst) { +static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, + const void* src, + void* dst) { int64_t n_elems = ggml_nelements(tensor); int64_t groups = n_elems / QK8_0; size_t quant_bytes = n_elems * sizeof(uint8_t); @@ -760,7 +760,7 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, * @param dst Pointer to the destination buffer where the Q8.0 formatted data * will be stored. */ -GGML_CALL static void ggml_backend_cann_transform_back_q8_0( +static void ggml_backend_cann_transform_back_q8_0( const ggml_tensor* tensor, const void* src, void* dst) { int64_t n_elems = ggml_nelements(tensor); int64_t groups = n_elems / QK8_0; @@ -792,8 +792,8 @@ GGML_CALL static void ggml_backend_cann_transform_back_q8_0( * @param dst Pointer to the destination buffer where transformed data will be * stored. */ -GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor, - const void* src, void* dst) { +static void ggml_backend_cann_transform(ggml_tensor* tensor, + const void* src, void* dst) { switch (tensor->type) { case GGML_TYPE_Q4_0: ggml_backend_cann_transform_q4_0(tensor, src, dst); @@ -818,7 +818,7 @@ GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor, * @param dst Pointer to the destination buffer where transformed tensor data * will be stored. */ -GGML_CALL static void ggml_backend_cann_transform_back( +static void ggml_backend_cann_transform_back( const ggml_tensor* tensor, void* src, void* dst) { switch (tensor->type) { case GGML_TYPE_Q4_0: @@ -841,7 +841,7 @@ GGML_CALL static void ggml_backend_cann_transform_back( * @param type The tensor type to check. * @return true if transformation is needed, false otherwise. 
*/ -GGML_CALL static bool need_transform(ggml_type type) { +static bool need_transform(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: @@ -860,7 +860,7 @@ GGML_CALL static bool need_transform(ggml_type type) { * @param buffer The CANN buffer from which to initialize the tensor. * @param tensor Pointer to the tensor to be initialized. */ -GGML_CALL static void ggml_backend_cann_buffer_init_tensor( +static void ggml_backend_cann_buffer_init_tensor( ggml_backend_buffer_t buffer, ggml_tensor* tensor) { if (tensor->view_src != NULL && tensor->view_offs == 0) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); @@ -896,7 +896,7 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor( * @param offset Offset in the source data from where to start copying. * @param size Size of the data to be copied, in bytes. */ -GGML_CALL static void ggml_backend_cann_buffer_set_tensor( +static void ggml_backend_cann_buffer_set_tensor( ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { ggml_backend_cann_buffer_context *ctx = @@ -941,7 +941,7 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor( * @param offset Offset in the destination buffer where to start copying. * @param size Size of the data to be copied, in bytes. */ -GGML_CALL static void ggml_backend_cann_buffer_get_tensor( +static void ggml_backend_cann_buffer_get_tensor( ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, size_t offset, size_t size) { ggml_backend_cann_buffer_context* ctx = @@ -975,7 +975,7 @@ GGML_CALL static void ggml_backend_cann_buffer_get_tensor( * @param dst Pointer to the destination tensor where the data will be copied. * @return true if the copy operation succeeded, false otherwise. */ -GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor( +static bool ggml_backend_cann_buffer_cpy_tensor( ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) { if (ggml_backend_buffer_is_cann(src->buffer)) { ggml_backend_cann_buffer_context* src_ctx = @@ -1017,7 +1017,7 @@ GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor( * @param buffer The CANN buffer to be cleared. * @param value The value to which each byte in the buffer will be set. */ -GGML_CALL static void ggml_backend_cann_buffer_clear( +static void ggml_backend_cann_buffer_clear( ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; @@ -1065,7 +1065,7 @@ struct ggml_backend_cann_buffer_type_context { * @param buft Pointer to the buffer type context. * @return Const pointer to the C-style string containing the name. */ -GGML_CALL static const char* ggml_backend_cann_buffer_type_name( +static const char* ggml_backend_cann_buffer_type_name( ggml_backend_buffer_type_t buft) { return "CANN"; @@ -1082,7 +1082,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_type_name( * @param size Size in bytes of the buffer to allocate. * @return Pointer to the allocated buffer, or nullptr if allocation fails. */ -GGML_CALL static ggml_backend_buffer_t +static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cann_buffer_type_context* buft_ctx = @@ -1121,7 +1121,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, * @return The alignment requirement in bytes (fixed at 128 bytes for CANN * buffers). 
*/ -GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment( +static size_t ggml_backend_cann_buffer_type_get_alignment( ggml_backend_buffer_type_t buft) { return 128; @@ -1142,7 +1142,7 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment( * @return The total allocation size in bytes required for the tensor in the * CANN buffer. */ -GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size( +static size_t ggml_backend_cann_buffer_type_get_alloc_size( ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; @@ -1193,7 +1193,7 @@ static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = { * @return A pointer to the buffer type interface for the specified device, or * nullptr if the device index is out of range. */ -GGML_CALL ggml_backend_buffer_type_t +ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) { static std::mutex mutex; std::lock_guard lock(mutex); @@ -1231,7 +1231,7 @@ ggml_backend_cann_buffer_type(int32_t device) { * @param buft Pointer to the host buffer type context. * @return Const pointer to the C-style string containing the name. */ -GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return "CANN_Host"; GGML_UNUSED(buft); @@ -1246,7 +1246,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backe * @param buft Pointer to the host buffer context. * @return Const pointer to the C-style string containing the name. */ -GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) { return "CANN_Host"; GGML_UNUSED(buffer); @@ -1260,7 +1260,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_bu * * @param buffer The CANN host buffer to free. */ -GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { +static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { ACL_CHECK(aclrtFreeHost(buffer->context)); } @@ -1294,7 +1294,7 @@ static void * ggml_cann_host_malloc(size_t size) { * @param size Size in bytes of the host buffer to allocate. * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails. */ -GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { void * hostPtr = ggml_cann_host_malloc(size); if (hostPtr == nullptr) { @@ -1316,7 +1316,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_ * Provides function pointers for allocating, querying properties, and managing * memory for CANN buffer types in the GGML backend. 
*/ -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_cann_host_buffer_type_name, @@ -1326,6 +1326,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, + /* .device = */ nullptr, /* .context = */ nullptr, }; @@ -1495,7 +1496,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, * @param backend Pointer to the CANN backend structure. * @return A pointer to a constant string representing the backend name. */ -GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) { +static const char* ggml_backend_cann_name(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; @@ -1510,7 +1511,7 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) { * * @param backend Pointer to the CANN backend structure to be freed. */ -GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) { +static void ggml_backend_cann_free(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; ACL_CHECK(aclrtSynchronizeDevice()); @@ -1535,7 +1536,7 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) { * @param backend Pointer to the CANN backend structure. * @return Pointer to the buffer type structure for the CANN backend. */ -GGML_CALL static ggml_backend_buffer_type_t +static ggml_backend_buffer_type_t ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; @@ -1556,11 +1557,11 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { * @param offset Offset in bytes within the host data. * @param size Size of the data to copy in bytes. */ -GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, - ggml_tensor *tensor, - const void *data, - size_t offset, - size_t size) { +static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, + ggml_tensor *tensor, + const void *data, + size_t offset, + size_t size) { ggml_backend_cann_context *cann_ctx = (ggml_backend_cann_context *)backend->context; @@ -1587,7 +1588,7 @@ GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, } } -GGML_CALL static void ggml_backend_cann_get_tensor_async( +static void ggml_backend_cann_get_tensor_async( ggml_backend_t backend, const ggml_tensor *tensor, void *data, size_t offset, size_t size) { ggml_backend_cann_context *cann_ctx = @@ -1626,7 +1627,7 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async( * @param dst Pointer to the destination tensor to copy data to. * @return true if the copy operation succeeds, false otherwise. */ -GGML_CALL static bool ggml_backend_cann_cpy_tensor_async( +static bool ggml_backend_cann_cpy_tensor_async( ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor* src, ggml_tensor* dst) { GGML_ASSERT(ggml_backend_is_cann(backend_src) || @@ -1694,7 +1695,7 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async( * * @param backend Pointer to the CANN backend structure to synchronize. 
*/ -GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) { +static void ggml_backend_cann_synchronize(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; @@ -1715,7 +1716,7 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) { * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation * completes successfully, otherwise an appropriate error status. */ -GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( +static enum ggml_status ggml_backend_cann_graph_compute( ggml_backend_t backend, ggml_cgraph* cgraph) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; @@ -1753,7 +1754,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( * @return bool Returns true if the operation is supported by the backend, * otherwise false. */ -GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, +static bool ggml_backend_cann_supports_op(ggml_backend_t backend, const ggml_tensor* op) { switch (op->op) { case GGML_OP_UNARY: @@ -1875,7 +1876,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { * @return bool Returns true if the CANN backend supports the buffer type, * otherwise false. */ -GGML_CALL static bool ggml_backend_cann_supports_buft( +static bool ggml_backend_cann_supports_buft( ggml_backend_t backend, ggml_backend_buffer_type_t buft) { if (ggml_backend_buft_is_cann(buft)) { ggml_backend_cann_context * cann_ctx = @@ -1901,7 +1902,7 @@ GGML_CALL static bool ggml_backend_cann_supports_buft( * @return bool Returns true if the operation should be offloaded, otherwise * false. */ -GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend, +static bool ggml_backend_cann_offload_op(ggml_backend_t backend, const ggml_tensor* op) { const int min_batch_size = 32; GGML_UNUSED(backend); @@ -2021,11 +2022,8 @@ static ggml_backend_i ggml_backend_cann_interface = { /* .supports_op = */ ggml_backend_cann_supports_op, /* .supports_buft = */ ggml_backend_cann_supports_buft, /* .offload_op = */ ggml_backend_cann_offload_op, - /* .event_new = */ ggml_backend_cann_event_new, - /* .event_free = */ ggml_backend_cann_event_free, /* .event_record = */ ggml_backend_cann_event_record, /* .event_wait = */ ggml_backend_cann_event_wait, - /* .event_synchronize = */ ggml_backend_cann_event_synchronize, }; /** @@ -2042,7 +2040,7 @@ static ggml_guid_t ggml_backend_cann_guid() { return &guid; } -GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) { +ggml_backend_t ggml_backend_cann_init(int32_t device) { aclInit(nullptr); if (device < 0 || device >= ggml_backend_cann_get_device_count()) { GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device); @@ -2058,75 +2056,30 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) { ggml_backend_t cann_backend = new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, + /* .device = */ nullptr, /* .context = */ ctx}; return cann_backend; } -GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) { +bool ggml_backend_is_cann(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid()); } -GGML_CALL int32_t ggml_backend_cann_get_device_count() { +int32_t ggml_backend_cann_get_device_count() { return ggml_cann_info().device_count; } -GGML_CALL void ggml_backend_cann_get_device_description( +void 
ggml_backend_cann_get_device_description( int32_t device, char* description, size_t description_size) { ggml_cann_set_device(device); const char* soc_name = aclrtGetSocName(); snprintf(description, description_size, "%s", soc_name); } -GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, - size_t* total) { +void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, + size_t* total) { ggml_cann_set_device(device); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); } - -// backend registry -/** - * @brief Initializes a CANN backend based on the provided parameters. - * - * This function initializes a CANN backend using the device index and then - * initializes the backend using `ggml_backend_cann_init`. - * - * @param params Parameters for initialization (unused in this implementation). - * @param user_data User data containing the device index to initialize the - * backend. - * @return ggml_backend_t The initialized CANN backend. - */ -GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params, - void* user_data) { - ggml_backend_t cann_backend = - ggml_backend_cann_init((int)(intptr_t)user_data); - return cann_backend; - - GGML_UNUSED(params); -} - -extern "C" GGML_CALL int ggml_backend_cann_reg_devices(); - -/** - * @brief Registers CANN (Ascend) devices as backend options. - * - * This function initializes ACL, retrieves the number of available CANN - * devices, and registers each device as a backend option using - * `ggml_backend_register`. Each device is given a unique name based on - * `GGML_CANN_NAME` followed by its index. - * - * @return int The number of CANN devices registered. - */ -GGML_CALL int ggml_backend_cann_reg_devices() { - uint32_t device_count = ggml_backend_cann_get_device_count(); - // initialization - for (uint32_t i = 0; i < device_count; i++) { - char name[128]; - snprintf(name, sizeof(name), "CANN%d", i); - ggml_backend_register(name, ggml_backend_reg_cann_init, - ggml_backend_cann_buffer_type(i), - (void*)(intptr_t)i); - } - return device_count; -} diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 6efdab14c..43151e235 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -99,11 +99,11 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in int id = -1; // in case cudaGetDevice fails cudaGetDevice(&id); - GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg); + GGML_CUDA_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg); GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_CUDA_LOG_ERROR(" %s\n", stmt); - // abort with GGML_ASSERT to get a stack trace - GGML_ABORT("CUDA error"); + // abort with GGML_ABORT to get a stack trace + GGML_ABORT(GGML_CUDA_NAME " error"); } // this is faster on Windows @@ -327,7 +327,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { return; } } - GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); + GGML_CUDA_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n"); ggml_cuda_set_device(device); CUDA_CHECK(cudaFree(ptr)); pool_size -= size; @@ -457,26 +457,26 @@ struct ggml_backend_cuda_buffer_context { } }; -GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->name.c_str(); } -GGML_CALL static bool 
ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { +static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name; } -GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; delete ctx; } -GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->dev_ptr; } -GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; if (tensor->view_src != NULL) { @@ -496,7 +496,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t } } -GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -504,7 +504,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } -GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -512,7 +512,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } -GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -520,7 +520,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } -GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_cuda(src->buffer)) { ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; @@ -541,7 +541,7 @@ 
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -550,7 +550,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffe CUDA_CHECK(cudaDeviceSynchronize()); } -static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { +static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { /* .get_name = */ ggml_backend_cuda_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, @@ -569,17 +569,17 @@ struct ggml_backend_cuda_buffer_type_context { std::string name; }; -GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) { ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; return ctx->name.c_str(); } static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_cuda_buffer_type_name; + return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; ggml_cuda_set_device(buft_ctx->device); @@ -600,13 +600,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } -GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; GGML_UNUSED(buft); } -GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; @@ -621,8 +621,8 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen GGML_UNUSED(buft); } -static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { - /* .get_name = */ ggml_backend_cuda_buffer_type_name, +static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX @@ -630,7 +630,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .is_host = */ NULL, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { +ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { static std::mutex mutex; std::lock_guard 
lock(mutex); @@ -643,9 +643,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { static bool ggml_backend_cuda_buffer_type_initialized = false; if (!ggml_backend_cuda_buffer_type_initialized) { - for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { + for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) { ggml_backend_cuda_buffer_types[i] = { /* .iface = */ ggml_backend_cuda_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i), /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)}, }; } @@ -715,7 +716,7 @@ struct ggml_backend_cuda_split_buffer_context { std::vector tensor_extras; }; -GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { return GGML_CUDA_NAME "_Split"; GGML_UNUSED(buffer); @@ -726,19 +727,19 @@ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds } -GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; delete ctx; } -GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced return (void *)0x1000; GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -786,7 +787,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu tensor->extra = extra; } -GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -824,7 +825,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf } } -GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -862,12 +863,12 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf } } -GGML_CALL static void 
ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { GGML_UNUSED(buffer); GGML_UNUSED(value); } -static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { +static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { /* .get_name = */ ggml_backend_cuda_split_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, @@ -882,17 +883,17 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { // cuda split buffer type -GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return GGML_CUDA_NAME "_Split"; GGML_UNUSED(buft); } static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name; + return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, @@ -902,13 +903,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); } -GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; GGML_UNUSED(buft); } -GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; size_t total_size = 0; @@ -935,14 +936,14 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; GGML_UNUSED(buft); } -static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { - /* .get_name = */ ggml_backend_cuda_split_buffer_type_name, +static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX @@ -950,7 +951,7 @@ static ggml_backend_buffer_type_i 
ggml_backend_cuda_split_buffer_type_interface /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { +ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { static std::mutex mutex; std::lock_guard lock(mutex); @@ -979,6 +980,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f struct ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0), /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, }; @@ -988,19 +990,19 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f // host buffer type -GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_CUDA_NAME "_Host"; GGML_UNUSED(buft); } -GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { return GGML_CUDA_NAME "_Host"; GGML_UNUSED(buffer); } -GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { CUDA_CHECK(cudaFreeHost(buffer->context)); } @@ -1022,7 +1024,7 @@ static void * ggml_cuda_host_malloc(size_t size) { return ptr; } -GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { void * ptr = ggml_cuda_host_malloc(size); if (ptr == nullptr) { @@ -1038,7 +1040,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_ return buffer; } -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, @@ -1048,6 +1050,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0), /* .context = */ nullptr, }; @@ -2375,26 +2378,26 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg // backend -GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) { +static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return cuda_ctx->name.c_str(); } -GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) { +static void ggml_backend_cuda_free(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; delete cuda_ctx; delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t 
ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return ggml_backend_cuda_buffer_type(cuda_ctx->device); } -GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -2403,7 +2406,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream())); } -GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -2412,7 +2415,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream())); } -GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; ggml_backend_buffer_t buf_dst = dst->view_src ? 
dst->view_src->buffer : dst->buffer; @@ -2467,7 +2470,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ return true; } -GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { +static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream())); @@ -2526,7 +2529,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return true; } -GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_device(cuda_ctx->device); @@ -2798,8 +2801,187 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t return GGML_STATUS_SUCCESS; } -GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; +static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream())); +} + +static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + if (ggml_backend_is_cuda(backend)) { + CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0)); + } else { +#if 0 + // untested + auto wait_fn = [](void * user_data) { + ggml_backend_event_t event = (ggml_backend_event_t)user_data; + ggml_backend_event_synchronize(event); + }; + + CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); +#endif + GGML_ABORT("fatal error"); + } +} + +static const ggml_backend_i ggml_backend_cuda_interface = { + /* .get_name = */ ggml_backend_cuda_get_name, + /* .free = */ ggml_backend_cuda_free, + /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, + /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, + /* .synchronize = */ ggml_backend_cuda_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_cuda_graph_compute, + /* .supports_op = */ NULL, // moved to device + /* .supports_buft = */ NULL, // moved to device + /* .offload_op = */ NULL, // moved to device + /* .event_record = */ ggml_backend_cuda_event_record, + /* .event_wait = */ ggml_backend_cuda_event_wait, +}; + +static ggml_guid_t ggml_backend_cuda_guid() { + static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 }; + return &guid; +} + +bool ggml_backend_is_cuda(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid()); +} + +int ggml_backend_cuda_get_device_count() { + return ggml_cuda_info().device_count; +} + +void ggml_backend_cuda_get_device_description(int device, char * 
description, size_t description_size) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + snprintf(description, description_size, "%s", prop.name); +} + +void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { + ggml_cuda_set_device(device); + + CUDA_CHECK(cudaMemGetInfo(free, total)); +} + +bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { + if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { + return false; + } + +#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) + cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + + GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, + size / 1024.0 / 1024.0, cudaGetErrorString(err)); + return false; + } + return true; +#else + return false; +#endif +} + +void ggml_backend_cuda_unregister_host_buffer(void * buffer) { + if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { + return; + } + + cudaError_t err = cudaHostUnregister(buffer); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + } +} + + +// backend device + +struct ggml_backend_cuda_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return ctx->description.c_str(); +} + +static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaMemGetInfo(free, total)); +} + +static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; +} + +static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_cuda_device_get_name(dev); + props->description = ggml_backend_cuda_device_get_description(dev); + props->type = ggml_backend_cuda_device_get_type(dev); + ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); + + bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; +#ifdef GGML_CUDA_NO_PEER_COPY + bool events = false; +#else + bool events = true; +#endif + + props->caps = { + /* async */ true, + /* host_buffer */ host_buffer, + /* events */ events, + }; +} + +static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return ggml_backend_cuda_init(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return ggml_backend_cuda_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return ggml_backend_cuda_host_buffer_type(); +} + 
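
A minimal usage sketch (illustrative, not part of the diff itself): the public CUDA helpers defined just above can be driven directly from application code to enumerate devices and create a backend. Only functions visible in this patch plus the generic ggml_backend_free() are used; the standalone main() wrapper and the 256-byte description buffer are assumptions made for the example.

#include <cstdio>
#include "ggml-backend.h"   // ggml_backend_t, ggml_backend_free
#include "ggml-cuda.h"      // ggml_backend_cuda_* API

int main() {
    const int n_devices = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n_devices; i++) {
        char desc[256];
        size_t free_mem = 0, total_mem = 0;
        // both helpers are defined earlier in this file
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        std::printf("CUDA%d: %s, %zu MiB free of %zu MiB\n",
                    i, desc, free_mem / (1024 * 1024), total_mem / (1024 * 1024));
    }
    // create a backend on device 0 and release it again
    ggml_backend_t backend = ggml_backend_cuda_init(0);
    if (backend != nullptr) {
        ggml_backend_free(backend);
    }
    return 0;
}
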
+static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + GGML_UNUSED(dev); + GGML_UNUSED(ptr); + GGML_UNUSED(size); + GGML_UNUSED(max_tensor_size); + return nullptr; +} + +// TODO: move these functions here +static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context; + switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -3004,7 +3186,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) { return true; } - const int cc = ggml_cuda_info().devices[cuda_ctx->device].cc; + const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; } case GGML_OP_CROSS_ENTROPY_LOSS: @@ -3014,115 +3196,170 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons default: return false; } - - GGML_UNUSED(backend); } -GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { +static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { if (ggml_backend_buft_is_cuda_split(buft)) { return true; } if (ggml_backend_buft_is_cuda(buft)) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - return buft_ctx->device == cuda_ctx->device; + return buft_ctx->device == dev_ctx->device; } return false; } -GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { +static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { const int min_batch_size = 32; return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - GGML_UNUSED(backend); + GGML_UNUSED(dev); } -static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { +static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) { #ifdef GGML_CUDA_NO_PEER_COPY return nullptr; #else - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context; - ggml_cuda_set_device(cuda_ctx->device); + ggml_cuda_set_device(dev_ctx->device); cudaEvent_t event; CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); return new ggml_backend_event { - /* .backend = */ backend, + /* .device = */ dev, /* .context = */ event, }; #endif } -static void ggml_backend_cuda_event_free(ggml_backend_event_t event) { - CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context)); +static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { + GGML_UNUSED(dev); + CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context)); delete event; } -static void ggml_backend_cuda_event_record(ggml_backend_event_t event) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context 
*)event->backend->context; - - CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream())); -} - -static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - if (ggml_backend_is_cuda(event->backend)) { - CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0)); - } else { -#if 0 - // untested - auto wait_fn = [](void * user_data) { - ggml_backend_event_t event = (ggml_backend_event_t)user_data; - ggml_backend_event_synchronize(event); - }; - - CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); -#endif - GGML_ABORT("fatal error"); - } -} - -static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) { +static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { + GGML_UNUSED(dev); CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); } -static ggml_backend_i ggml_backend_cuda_interface = { - /* .get_name = */ ggml_backend_cuda_name, - /* .free = */ ggml_backend_cuda_free, - /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, - /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, - /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, - /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, - /* .synchronize = */ ggml_backend_cuda_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_cuda_graph_compute, - /* .supports_op = */ ggml_backend_cuda_supports_op, - /* .supports_buft = */ ggml_backend_cuda_supports_buft, - /* .offload_op = */ ggml_backend_cuda_offload_op, - /* .event_new = */ ggml_backend_cuda_event_new, - /* .event_free = */ ggml_backend_cuda_event_free, - /* .event_record = */ ggml_backend_cuda_event_record, - /* .event_wait = */ ggml_backend_cuda_event_wait, - /* .event_synchronize = */ ggml_backend_cuda_event_synchronize, +static const ggml_backend_device_i ggml_backend_cuda_device_interface = { + /* .get_name = */ ggml_backend_cuda_device_get_name, + /* .get_description = */ ggml_backend_cuda_device_get_description, + /* .get_memory = */ ggml_backend_cuda_device_get_memory, + /* .get_type = */ ggml_backend_cuda_device_get_type, + /* .get_props = */ ggml_backend_cuda_device_get_props, + /* .init_backend = */ ggml_backend_cuda_device_init, + /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr, + /* .supports_op = */ ggml_backend_cuda_device_supports_op, + /* .supports_buft = */ ggml_backend_cuda_device_supports_buft, + /* .offload_op = */ ggml_backend_cuda_device_offload_op, + /* .event_new = */ ggml_backend_cuda_device_event_new, + /* .event_free = */ ggml_backend_cuda_device_event_free, + /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize, }; -static ggml_guid_t ggml_backend_cuda_guid() { - static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 }; - return &guid; +// backend reg + +struct ggml_backend_cuda_reg_context { + std::vector devices; +}; + +static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_CUDA_NAME; } -GGML_CALL 
ggml_backend_t ggml_backend_cuda_init(int device) {
+static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
+    ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
+    return ctx->devices.size();
+}
+
+static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
+    GGML_ASSERT(index < ctx->devices.size());
+    return ctx->devices[index];
+}
+
+static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_cuda_split_buffer_type;
+    }
+    if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
+        return (void *)ggml_backend_cuda_register_host_buffer;
+    }
+    if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
+        return (void *)ggml_backend_cuda_unregister_host_buffer;
+    }
+    return nullptr;
+}
+
+static void ggml_backend_cuda_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) {
+    GGML_UNUSED(reg);
+    ggml_backend_cuda_log_set_callback(log_callback, user_data);
+}
+
+static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
+    /* .get_name = */ ggml_backend_cuda_reg_get_name,
+    /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
+    /* .get_device_get = */ ggml_backend_cuda_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
+    /* .set_log_callback = */ ggml_backend_cuda_reg_set_log_callback,
+};
+
+// backend registry
+ggml_backend_reg_t ggml_backend_cuda_reg() {
+    static ggml_backend_reg reg;
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+
+            for (int i = 0; i < ggml_cuda_info().device_count; i++) {
+                ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
+                dev_ctx->device = i;
+                dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
+
+                ggml_cuda_set_device(i);
+                cudaDeviceProp prop;
+                CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
+                dev_ctx->description = prop.name;
+
+                ggml_backend_dev_t dev = new ggml_backend_device {
+                    /* .interface = */ ggml_backend_cuda_device_interface,
+                    /* .reg = */ &reg,
+                    /* .context = */ dev_ctx
+                };
+                ctx->devices.push_back(dev);
+            }
+
+            reg = ggml_backend_reg {
+                /* .interface = */ ggml_backend_cuda_reg_interface,
+                /* .context = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+
+    return &reg;
+}
+
+ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
         GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
@@ -3137,82 +3374,9 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_backend_t cuda_backend = new ggml_backend {
         /* .guid = */ ggml_backend_cuda_guid(),
         /* .interface = */ ggml_backend_cuda_interface,
-        /* .context = */ ctx
+        /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
+        /* .context = */ ctx,
     };
 
     return cuda_backend;
 }
-
-GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
-}
-
-GGML_CALL int ggml_backend_cuda_get_device_count() {
-    return ggml_cuda_info().device_count;
-}
-
-GGML_CALL void ggml_backend_cuda_get_device_description(int 
device, char * description, size_t description_size) { - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - snprintf(description, description_size, "%s", prop.name); -} - -GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { - ggml_cuda_set_device(device); - - CUDA_CHECK(cudaMemGetInfo(free, total)); -} - -GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { - if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { - return false; - } - -#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) - cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); - if (err != cudaSuccess) { - // clear the error - cudaGetLastError(); - - GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, - size / 1024.0 / 1024.0, cudaGetErrorString(err)); - return false; - } - return true; -#else - return false; -#endif -} - -GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) { - if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { - return; - } - - cudaError_t err = cudaHostUnregister(buffer); - if (err != cudaSuccess) { - // clear the error - cudaGetLastError(); - } -} - -// backend registry -GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { - ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); - return cuda_backend; - - GGML_UNUSED(params); -} - -extern "C" GGML_CALL int ggml_backend_cuda_reg_devices(); - -GGML_CALL int ggml_backend_cuda_reg_devices() { - int device_count = ggml_backend_cuda_get_device_count(); - //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization - for (int i = 0; i < device_count; i++) { - char name[128]; - snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i); - ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i); - } - return device_count; -} diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 9cbc57a64..2c926aaee 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -1921,6 +1921,7 @@ ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) { for (const auto & dev : devices) { vec.push_back({ /* .iface = */ ggml_backend_kompute_buffer_type_interface, + /* .device = */ nullptr, /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc) }); } @@ -1989,11 +1990,8 @@ static struct ggml_backend_i kompute_backend_i = { /* .supports_op = */ ggml_backend_kompute_supports_op, /* .supports_buft = */ ggml_backend_kompute_supports_buft, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_kompute_guid() { @@ -2008,6 +2006,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) { ggml_backend_t kompute_backend = new ggml_backend { /* .guid = */ ggml_backend_kompute_guid(), /* .interface = */ kompute_backend_i, + /* .device = */ nullptr, /* .context = */ s_kompute_context, }; @@ -2017,23 +2016,3 @@ ggml_backend_t ggml_backend_kompute_init(int device) { bool ggml_backend_is_kompute(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid()); } - -static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) { - 
GGML_UNUSED(params); - return ggml_backend_kompute_init(intptr_t(user_data)); -} - -extern "C" int ggml_backend_kompute_reg_devices(); - -int ggml_backend_kompute_reg_devices() { - auto devices = ggml_vk_available_devices_internal(0); - for (const auto & device : devices) { - ggml_backend_register( - ggml_kompute_format_name(device.index).c_str(), - ggml_backend_reg_kompute_init, - ggml_backend_kompute_buffer_type(device.index), - reinterpret_cast(intptr_t(device.index)) - ); - } - return devices.size(); -} diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 9da08fe2e..8ff16983e 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -3202,13 +3202,13 @@ static void ggml_backend_metal_free_device(void) { } } -GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { return "Metal"; UNUSED(buffer); } -GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; for (int i = 0; i < ctx->n_buffers; i++) { @@ -3227,25 +3227,25 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_ free(ctx); } -GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; return ctx->all_data; } -GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); UNUSED(buffer); } -GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); UNUSED(buffer); } -GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; @@ -3255,7 +3255,7 @@ GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t UNUSED(buffer); } -GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; memset(ctx->all_data, value, ctx->all_size); @@ -3276,7 +3276,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { // default buffer type -GGML_CALL static const char * 
ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "Metal"; UNUSED(buft); @@ -3307,7 +3307,7 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s UNUSED(size_aligned); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); const size_t size_page = sysconf(_SC_PAGESIZE); @@ -3349,12 +3349,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } -GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 32; UNUSED(buft); } -GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { id device = ggml_backend_metal_get_device(); size_t max_size = device.maxBufferLength; ggml_backend_metal_free_device(); @@ -3364,13 +3364,13 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend UNUSED(buft); } -GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; UNUSED(buft); } -GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { +ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_type_get_name, @@ -3380,6 +3380,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_metal_buffer_type_is_host, }, + /* .device = */ NULL, /* .context = */ NULL, }; @@ -3388,7 +3389,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { // buffer from ptr -GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { +ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); ctx->all_data = data; @@ -3468,37 +3469,37 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, // backend -GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) { +static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; UNUSED(backend); } -GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) { +static void ggml_backend_metal_free(ggml_backend_t backend) { struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ggml_metal_free(ctx); free(backend); } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t 
ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_metal_buffer_type(); UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); } -GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; return ggml_metal_supports_op(metal_ctx, op); } -GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { +static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name; UNUSED(backend); @@ -3539,11 +3540,8 @@ static struct ggml_backend_i ggml_backend_metal_i = { /* .supports_op = */ ggml_backend_metal_supports_op, /* .supports_buft = */ ggml_backend_metal_supports_buft, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { @@ -3568,6 +3566,7 @@ ggml_backend_t ggml_backend_metal_init(void) { *backend = (struct ggml_backend) { /* .guid = */ ggml_backend_metal_guid(), /* .interface = */ ggml_backend_metal_i, + /* .device = */ NULL, /* .context = */ ctx, }; @@ -3604,9 +3603,9 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { ctx->capture_next_compute = true; } -GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning +ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning -GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { +ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { return ggml_backend_metal_init(); GGML_UNUSED(params); diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 49b3fa911..ab7298cba 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -319,12 +319,12 @@ static std::shared_ptr get_socket(const std::string & endpoint) { return sock; } -GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; return ctx->name.c_str(); } -GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; // input serialization format: | remote_ptr (8 bytes) | std::vector input(sizeof(uint64_t), 0); @@ -337,7 +337,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t delete ctx; } -GGML_CALL static void * 
ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) { return ctx->base_cache[buffer]; @@ -388,7 +388,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) { return result; } -GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { UNUSED(buffer); if (ggml_is_quantized(tensor->type)) { // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized @@ -396,7 +396,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t } } -GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; // input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) | size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size; @@ -410,7 +410,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t b GGML_ASSERT(status); } -GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; // input serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) | int input_size = sizeof(rpc_tensor) + 2*sizeof(uint64_t); @@ -427,7 +427,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t b memcpy(data, output.data(), size); } -GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { // check if src and dst are on the same server ggml_backend_buffer_t src_buffer = src->buffer; ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context; @@ -452,7 +452,7 @@ GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t b return output[0]; } -GGML_CALL static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; // serialization format: | bufptr (8 bytes) | value (1 byte) | int input_size = sizeof(uint64_t) + sizeof(uint8_t); @@ -477,12 +477,12 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = { /* .reset = */ NULL, }; -GGML_CALL static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) { ggml_backend_rpc_buffer_type_context 
* buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; return buft_ctx->name.c_str(); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; // input serialization format: | size (8 bytes) | int input_size = sizeof(uint64_t); @@ -522,7 +522,7 @@ static size_t get_alignment(const std::shared_ptr & sock) { return alignment; } -GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; return buft_ctx->alignment; } @@ -540,12 +540,12 @@ static size_t get_max_size(const std::shared_ptr & sock) { return max_size; } -GGML_CALL static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; return buft_ctx->max_size; } -GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { UNUSED(buft); return ggml_nbytes(tensor); } @@ -559,24 +559,24 @@ static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = { /* .is_host = */ NULL, }; -GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) { +static const char * ggml_backend_rpc_name(ggml_backend_t backend) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; return rpc_ctx->name.c_str(); } -GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) { +static void ggml_backend_rpc_free(ggml_backend_t backend) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; delete rpc_ctx; delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context; return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str()); } -GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) { +static void ggml_backend_rpc_synchronize(ggml_backend_t backend) { UNUSED(backend); // this is no-op because we don't have any async operations } @@ -618,7 +618,7 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector & o memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); } -GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; std::vector input; serialize_graph(cgraph, input); @@ -630,14 +630,14 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t return (enum ggml_status)output[0]; } 
-GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) { +static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) { UNUSED(backend); UNUSED(op); //TODO: call the remote backend and cache the results return true; } -GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { +static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) { return false; } @@ -662,14 +662,11 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .supports_op = */ ggml_backend_rpc_supports_op, /* .supports_buft = */ ggml_backend_rpc_supports_buft, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { +GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { static std::mutex mutex; std::lock_guard lock(mutex); // NOTE: buffer types are allocated and never freed; this is by design @@ -694,13 +691,14 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type { /* .iface = */ ggml_backend_rpc_buffer_type_interface, + /* .device = */ nullptr, /* .context = */ buft_ctx }; buft_map[endpoint] = buft; return buft; } -GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { +ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context { /* .endpoint = */ endpoint, /* .name = */ "RPC[" + std::string(endpoint) + "]", @@ -709,12 +707,13 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_rpc_guid(), /* .interface = */ ggml_backend_rpc_interface, + /* .device = */ nullptr, /* .context = */ ctx }; return backend; } -GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) { +GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid()); } @@ -734,7 +733,7 @@ static void get_device_memory(const std::shared_ptr & sock, size_t * f *total = total_mem; } -GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) { +GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) { auto sock = get_socket(endpoint); if (sock == nullptr) { *free = 0; diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 6978a3192..4d3f1c5ce 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -4038,7 +4038,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens return true; } -GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try { +GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len) try { GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n"); for(int i=0;icontext; return ctx->name.c_str(); } -GGML_CALL static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) { +static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name; } @@ -4162,7 
+4162,7 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) { return ctx->dev_ptr; } -GGML_CALL static void +static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; @@ -4237,7 +4237,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static bool +static bool ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) try { @@ -4339,12 +4339,12 @@ struct ggml_backend_sycl_buffer_type_context { queue_ptr stream = nullptr; }; -GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) { ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; return ctx->name.c_str(); } -GGML_CALL static ggml_backend_buffer_t +static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; @@ -4368,7 +4368,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; UNUSED(buft); } @@ -4379,7 +4379,7 @@ static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_typ UNUSED(buft); } -GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; @@ -4424,6 +4424,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { queue_ptr stream = &(device_i.default_queue()); ggml_backend_sycl_buffer_types[i] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, + /* .device = */ nullptr, /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream}, }; } @@ -4449,6 +4450,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte for (int i = 0; i < ggml_sycl_info().device_count; i++) { ggml_backend_sycl_buffer_types[i] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, + /* .device = */ nullptr, /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)}, }; } @@ -4513,7 +4515,7 @@ struct ggml_backend_sycl_split_buffer_context { std::vector streams; }; -GGML_CALL static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) { return GGML_SYCL_NAME "_Split"; UNUSED(buffer); @@ -4523,19 +4525,19 @@ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name; } -GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_sycl_split_buffer_context * ctx = 
(ggml_backend_sycl_split_buffer_context *)buffer->context; delete ctx; } -GGML_CALL static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced return (void *)0x1000; UNUSED(buffer); } -GGML_CALL static void +static void ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported @@ -4618,7 +4620,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static void +static void ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { @@ -4671,7 +4673,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static void +static void ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { @@ -4724,7 +4726,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { UNUSED(buffer); UNUSED(value); } @@ -4742,13 +4744,13 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { /* .reset = */ NULL, }; -GGML_CALL static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_SYCL_NAME "_Split"; UNUSED(buft); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, @@ -4758,12 +4760,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size); } -GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; UNUSED(buft); } -GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context; size_t total_size = 0; @@ -4790,7 +4792,7 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { 
return false; UNUSED(buft); @@ -4805,7 +4807,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) { +ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) { static std::mutex mutex; std::lock_guard lock(mutex); @@ -4837,6 +4839,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f struct ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_sycl_split_buffer_type_interface, + /* .device = */ nullptr, /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr}, }; @@ -4846,13 +4849,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f // host buffer type -GGML_CALL static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_SYCL_NAME "_Host"; UNUSED(buft); } -GGML_CALL static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) { return GGML_SYCL_NAME "_Host"; UNUSED(buffer); @@ -4890,6 +4893,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, + /* .device = */ nullptr, /* .context = */ nullptr, }; @@ -4898,14 +4902,14 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { // backend -GGML_CALL static const char * ggml_backend_sycl_name(ggml_backend_t backend) { +static const char * ggml_backend_sycl_name(ggml_backend_t backend) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; return sycl_ctx->name.c_str(); } -GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) { +static void ggml_backend_sycl_free(ggml_backend_t backend) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; delete sycl_ctx; @@ -4913,12 +4917,12 @@ GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) { } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; return ggml_backend_sycl_buffer_type(sycl_ctx->device); } -GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, +static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { @@ -4936,7 +4940,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, +static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { @@ -4954,9 +4958,9 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, - const ggml_tensor *src, - ggml_tensor *dst) try { +static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, + const ggml_tensor 
*src, + ggml_tensor *dst) try { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) { /* @@ -4991,7 +4995,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_sycl_set_main_device(sycl_ctx->device); @@ -5019,7 +5023,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back return GGML_STATUS_SUCCESS; } -GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { +static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { switch (op->op) { case GGML_OP_CONV_TRANSPOSE_1D: { @@ -5166,13 +5170,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons UNUSED(backend); } -GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) { +static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) { const int min_batch_size = 32; return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID; GGML_UNUSED(backend); } -GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { +static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) { return false; } @@ -5197,11 +5201,8 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .supports_op = */ ggml_backend_sycl_supports_op, /* .supports_buft = */ ggml_backend_sycl_supports_buft, /* .offload_op = */ ggml_backend_sycl_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_sycl_guid() { @@ -5209,7 +5210,7 @@ static ggml_guid_t ggml_backend_sycl_guid() { return &guid; } -GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { +ggml_backend_t ggml_backend_sycl_init(int device) { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n"); ggml_check_sycl(); @@ -5224,6 +5225,7 @@ GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { ggml_backend_t sycl_backend = new ggml_backend { /* .guid = */ ggml_backend_sycl_guid(), /* .interface = */ ggml_backend_sycl_interface, + /* .device = */ nullptr, /* .context = */ ctx }; @@ -5234,26 +5236,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid()); } -GGML_CALL int ggml_backend_sycl_get_device_count() { +int ggml_backend_sycl_get_device_count() { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n"); return ggml_sycl_info().device_count; } - -GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) { - ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data); - return sycl_backend; - - UNUSED(params); -} - -extern "C" int ggml_backend_sycl_reg_devices(); - -int ggml_backend_sycl_reg_devices() { - assert(ggml_sycl_info().device_count>0); - for (int i = 0; i < 
ggml_sycl_info().device_count; i++) { - char name[128]; - snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i); - ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i); - } - return ggml_sycl_info().device_count; -} diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 789b24bdf..12ad9d810 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -119,11 +119,11 @@ struct ggml_backend_vk_buffer_type_context { vk_device device; }; -GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft); -GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft); -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft); -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor); +static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft); +static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); +static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft); +static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft); +static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor); static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .get_name = */ ggml_backend_vk_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, @@ -622,7 +622,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor); typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); -GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend); +static void ggml_backend_vk_free(ggml_backend_t backend); // variables to track number of compiles in progress static uint32_t compile_count = 0; @@ -1953,6 +1953,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->buffer_type = { /* .iface = */ ggml_backend_vk_buffer_type_interface, + /* .device = */ nullptr, /* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device }, }; @@ -6147,13 +6148,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->device->device.destroyFence(ctx->fence); } -GGML_CALL static int ggml_vk_get_device_count() { +static int ggml_vk_get_device_count() { ggml_vk_instance_init(); return vk_instance.device_indices.size(); } -GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) { +static void ggml_vk_get_device_description(int device, char * description, size_t description_size) { ggml_vk_instance_init(); std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); @@ -6170,36 +6171,36 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript // device backend -GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; return ctx->name.c_str(); } -GGML_CALL static bool 
ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { +static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_vk_buffer_get_name; } -GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()"); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; } -GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { return vk_ptr_base; UNUSED(buffer); } -GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); } } -GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; @@ -6207,7 +6208,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } -GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; @@ -6216,7 +6217,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } -GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_vk(src->buffer)) { ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; @@ -6233,7 +6234,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu UNUSED(buffer); } -GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { 
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); @@ -6253,13 +6254,13 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { }; // vk buffer type -GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context; return ctx->name.c_str(); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; @@ -6275,23 +6276,23 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer( return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); } -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; return ctx->device->properties.limits.minStorageBufferOffsetAlignment; } -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; return ctx->device->max_memory_allocation_size; } -GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { return ggml_nbytes(tensor); UNUSED(buft); } -GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { +ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { ggml_vk_instance_init(); VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")"); @@ -6303,24 +6304,24 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) // host buffer type -GGML_CALL static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) { +static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_VK_NAME "_Host"; UNUSED(buft); } -GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) { +static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) { return GGML_VK_NAME "_Host"; UNUSED(buffer); } -GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(vk_instance.devices[0], buffer->context); } -GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t 
ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); size += 32; // Behave like the CPU buffer type @@ -6344,7 +6345,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu UNUSED(buft); } -GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment; UNUSED(buft); @@ -6352,7 +6353,7 @@ GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_back // Should be changed to return device-specific host buffer type // but that probably requires changes in llama.cpp -GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_vk_host_buffer_type_name, @@ -6362,6 +6363,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, + /* .device = */ nullptr, /* .context = */ nullptr, }; @@ -6375,13 +6377,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { // backend -GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) { +static const char * ggml_backend_vk_name(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; return ctx->name.c_str(); } -GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) { +static void ggml_backend_vk_free(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); @@ -6391,13 +6393,13 @@ GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) { delete backend; } -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { +static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; return &ctx->device->buffer_type; } -GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); @@ -6420,7 +6422,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } -GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, 
const ggml_tensor * tensor, void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); @@ -6443,7 +6445,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } -GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { @@ -6471,7 +6473,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c return false; } -GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { +static void ggml_backend_vk_synchronize(ggml_backend_t backend) { VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; if(ctx->transfer_ctx.expired()) { @@ -6501,7 +6503,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) { return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; } -GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; @@ -6564,7 +6566,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen UNUSED(backend); } -GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) { +static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) { // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context; switch (op->op) { @@ -6687,7 +6689,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const UNUSED(backend); } -GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) { +static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) { const int min_batch_size = 32; return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || @@ -6696,7 +6698,7 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g UNUSED(backend); } -GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { +static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { return false; } @@ -6724,11 +6726,8 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .supports_op = */ 
ggml_backend_vk_supports_op, /* .supports_buft = */ ggml_backend_vk_supports_buft, /* .offload_op = */ ggml_backend_vk_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_vk_guid() { @@ -6736,7 +6735,7 @@ static ggml_guid_t ggml_backend_vk_guid() { return &guid; } -GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) { +ggml_backend_t ggml_backend_vk_init(size_t dev_num) { VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")"); ggml_backend_vk_context * ctx = new ggml_backend_vk_context; @@ -6745,25 +6744,26 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) { ggml_backend_t vk_backend = new ggml_backend { /* .guid = */ ggml_backend_vk_guid(), /* .interface = */ ggml_backend_vk_interface, + /* .device = */ nullptr, /* .context = */ ctx, }; return vk_backend; } -GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) { +bool ggml_backend_is_vk(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid()); } -GGML_CALL int ggml_backend_vk_get_device_count() { +int ggml_backend_vk_get_device_count() { return ggml_vk_get_device_count(); } -GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { +void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { ggml_vk_get_device_description(device, description, description_size); } -GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { +void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { GGML_ASSERT(device < (int) vk_instance.device_indices.size()); vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; @@ -6779,27 +6779,6 @@ GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size } } -// backend registry -GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) { - ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data); - return vk_backend; - - UNUSED(params); -} - -extern "C" GGML_CALL int ggml_backend_vk_reg_devices(); - -GGML_CALL int ggml_backend_vk_reg_devices() { - ggml_vk_instance_init(); - - for (size_t i = 0; i < vk_instance.device_indices.size(); i++) { - char name[128]; - snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i); - ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT - } - return vk_instance.device_indices.size(); -} - // Extension availability static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions) { #ifdef GGML_VULKAN_VALIDATE diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bcbc32d91..e740e58b2 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -461,7 +461,7 @@ struct ggml_arm_arch_features_type { } ggml_arm_arch_features = {-1, -1, -1, 0}; #endif -GGML_CALL const char * ggml_status_to_string(enum ggml_status status) { +const char * ggml_status_to_string(enum ggml_status status) { switch (status) { case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)"; case GGML_STATUS_FAILED: return "GGML status: error (operation failed)"; @@ -3382,19 +3382,19 @@ void ggml_print_objects(const struct ggml_context * ctx) { GGML_PRINT("%s: --- end ---\n", __func__); } -GGML_CALL 
int64_t ggml_nelements(const struct ggml_tensor * tensor) { +int64_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) { +int64_t ggml_nrows(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) { +size_t ggml_nbytes(const struct ggml_tensor * tensor) { size_t nbytes; size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { @@ -3417,15 +3417,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -GGML_CALL int64_t ggml_blck_size(enum ggml_type type) { +int64_t ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; } -GGML_CALL size_t ggml_type_size(enum ggml_type type) { +size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } -GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) { +size_t ggml_row_size(enum ggml_type type, int64_t ne) { assert(ne % ggml_blck_size(type) == 0); return ggml_type_size(type)*ne/ggml_blck_size(type); } @@ -3434,15 +3434,15 @@ double ggml_type_sizef(enum ggml_type type) { return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } -GGML_CALL const char * ggml_type_name(enum ggml_type type) { +const char * ggml_type_name(enum ggml_type type) { return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE"; } -GGML_CALL bool ggml_is_quantized(enum ggml_type type) { +bool ggml_is_quantized(enum ggml_type type) { return type_traits[type].is_quantized; } -GGML_CALL const char * ggml_op_name(enum ggml_op op) { +const char * ggml_op_name(enum ggml_op op) { return GGML_OP_NAME[op]; } @@ -3454,7 +3454,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) { return GGML_UNARY_OP_NAME[op]; } -GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) { +const char * ggml_op_desc(const struct ggml_tensor * t) { if (t->op == GGML_OP_UNARY) { enum ggml_unary_op uop = ggml_get_unary_op(t); return ggml_unary_op_name(uop); @@ -3462,7 +3462,7 @@ GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) { return ggml_op_name(t->op); } -GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) { +size_t ggml_element_size(const struct ggml_tensor * tensor) { return ggml_type_size(tensor->type); } @@ -3555,7 +3555,7 @@ size_t ggml_tensor_overhead(void) { return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE; } -GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) { +bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -3581,23 +3581,23 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { return true; } -GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous(const struct ggml_tensor * tensor) { return ggml_is_contiguous_0(tensor); } -GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 0); } -GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 
1); } -GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 2); } -GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) { +bool ggml_is_permuted(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; @@ -3612,7 +3612,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) { +bool ggml_is_empty(const struct ggml_tensor * tensor) { for (int i = 0; i < GGML_MAX_DIMS; ++i) { if (tensor->ne[i] == 0) { // empty if any dimension has no elements @@ -4628,7 +4628,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { return (float *)(tensor->data); } -GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { +enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { GGML_ASSERT(tensor->op == GGML_OP_UNARY); return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); } @@ -12731,6 +12731,10 @@ static void ggml_compute_forward_out_prod_f32( GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + const int ith = params->ith; const int nth = params->nth; @@ -14060,7 +14064,7 @@ static void ggml_rope_cache_init( } } -GGML_CALL void ggml_rope_yarn_corr_dims( +void ggml_rope_yarn_corr_dims( int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end correction dims diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index f16336594..ffce2aab0 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -122,7 +122,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h # src/ggml-alloc.c -> ggml/src/ggml-alloc.c # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h - # src/ggml-backend.c -> ggml/src/ggml-backend.c + # src/ggml-backend.cpp -> ggml/src/ggml-backend.cpp # src/ggml-cann/* -> ggml/src/ggml-cann/ # src/ggml-cann.cpp -> ggml/src/ggml-cann.cpp # src/ggml-common.h -> ggml/src/ggml-common.h @@ -169,7 +169,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.cpp/\1ggml\/src\/ggml-backend.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 30a62e088..f6ff5e683 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -9,7 +9,7 @@ cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h -cp 
-rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c +cp -rpv ../ggml/src/ggml-backend.cpp ./ggml/src/ggml-backend.cpp cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ cp -rpv ../ggml/src/ggml-cann.cpp ./ggml/src/ggml-cann.cpp cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h diff --git a/src/llama.cpp b/src/llama.cpp index af19e5c86..69ba65395 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -12,9 +12,7 @@ # include "ggml-rpc.h" #endif -#ifdef GGML_USE_CUDA -# include "ggml-cuda.h" -#elif defined(GGML_USE_VULKAN) +#if defined(GGML_USE_VULKAN) # include "ggml-vulkan.h" #elif defined(GGML_USE_SYCL) # include "ggml-sycl.h" @@ -2264,51 +2262,13 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_ return piece; } -static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { - ggml_backend_buffer_type_t buft = nullptr; - -#if defined(GGML_USE_CUDA) - // host buffers should only be used when data is expected to be copied to/from the GPU - if (host_buffer) { - buft = ggml_backend_cuda_host_buffer_type(); - } -#elif defined(GGML_USE_SYCL) - if (host_buffer) { - buft = ggml_backend_sycl_host_buffer_type(); - } -#elif defined(GGML_USE_CANN) - if (host_buffer) { - buft = ggml_backend_cann_host_buffer_type(); - } -#elif defined(GGML_USE_CPU_HBM) - buft = ggml_backend_cpu_hbm_buffer_type(); -#elif defined(GGML_USE_VULKAN) - if (host_buffer) { - buft = ggml_backend_vk_host_buffer_type(); - } -#endif - - if (buft == nullptr) { - buft = ggml_backend_cpu_buffer_type(); - } - return buft; - - GGML_UNUSED(host_buffer); -} - // // globals // struct llama_state { llama_state() { -#ifdef GGML_USE_METAL - ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data); -#elif defined(GGML_USE_CUDA) - ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data); -#elif defined(GGML_USE_CANN) - ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data); -#endif + llama_log_set(log_callback, log_callback_user_data); } // We save the log callback globally @@ -2920,14 +2880,17 @@ struct llama_model { std::vector layers; + // gguf metadata + std::unordered_map gguf_kv; + llama_split_mode split_mode; int main_gpu; int n_gpu_layers; - std::vector rpc_servers; + // list of devices used in this model + std::vector devices; - // gguf metadata - std::unordered_map gguf_kv; + std::vector rpc_servers; // layer -> buffer type mapping struct layer_buft { @@ -2970,11 +2933,6 @@ struct llama_model { ggml_free(ctx); } for (ggml_backend_buffer_t buf : bufs) { -#ifdef GGML_USE_CUDA - if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) { - ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf)); - } -#endif ggml_backend_buffer_free(buf); } while (!lora_adapters.empty()) { @@ -3460,72 +3418,116 @@ struct llama_lora_adapter { } }; -static size_t llama_get_device_count(const llama_model & model) { - size_t count = 1; -#if defined(GGML_USE_CUDA) - count = ggml_backend_cuda_get_device_count(); -#elif defined(GGML_USE_SYCL) - count = ggml_backend_sycl_get_device_count(); -#elif defined(GGML_USE_VULKAN) - count = ggml_backend_vk_get_device_count(); -#elif defined(GGML_USE_CANN) - return ggml_backend_cann_get_device_count(); -#endif +static int llama_get_device_count(const llama_model & model) { + int count = (int) model.devices.size(); + #if defined(GGML_USE_RPC) - count += model.rpc_servers.size(); + count += (int) model.rpc_servers.size(); #endif + +#if defined(GGML_USE_METAL) + count += 1; 
+#elif defined(GGML_USE_SYCL) + count += ggml_backend_sycl_get_device_count(); +#elif defined(GGML_USE_VULKAN) + count += ggml_backend_vk_get_device_count(); +#elif defined(GGML_USE_CANN) + count += ggml_backend_cann_get_device_count(); +#endif + return count; + GGML_UNUSED(model); } -static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) { +static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_model & model, bool host_buffer) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_RPC - int rpc_count = (int)model.rpc_servers.size(); -#else - int rpc_count = 0; -#endif - int local_gpu = gpu - rpc_count; -#if defined(GGML_USE_RPC) - if (gpu < rpc_count) { - const char * endpoint = model.rpc_servers[gpu].c_str(); - return ggml_backend_rpc_buffer_type(endpoint); + if (host_buffer) { + for (auto * dev : model.devices) { + buft = ggml_backend_dev_host_buffer_type(dev); + if (buft != nullptr) { + break; + } + } } -#endif -#if defined(GGML_USE_METAL) - buft = ggml_backend_metal_buffer_type(); -#elif defined(GGML_USE_CUDA) - buft = ggml_backend_cuda_buffer_type(local_gpu); -#elif defined(GGML_USE_VULKAN) - buft = ggml_backend_vk_buffer_type(local_gpu); -#elif defined(GGML_USE_SYCL) - buft = ggml_backend_sycl_buffer_type(local_gpu); -#elif defined(GGML_USE_KOMPUTE) - buft = ggml_backend_kompute_buffer_type(local_gpu); - if (buft == nullptr) { - LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu); + +#if defined(GGML_USE_SYCL) + if (host_buffer) { + buft = ggml_backend_sycl_host_buffer_type(); } #elif defined(GGML_USE_CANN) - buft = ggml_backend_cann_buffer_type(local_gpu); + if (host_buffer) { + buft = ggml_backend_cann_host_buffer_type(); + } +#elif defined(GGML_USE_CPU_HBM) + buft = ggml_backend_cpu_hbm_buffer_type(); +#elif defined(GGML_USE_VULKAN) + if (host_buffer) { + buft = ggml_backend_vk_host_buffer_type(); + } #endif if (buft == nullptr) { - buft = llama_default_buffer_type_cpu(true); + buft = ggml_backend_cpu_buffer_type(); } return buft; + + GGML_UNUSED(host_buffer); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) { + ggml_backend_buffer_type_t buft = nullptr; + +#if defined(GGML_USE_RPC) + int rpc_count = (int)model.rpc_servers.size(); + if (device < rpc_count) { + const char * endpoint = model.rpc_servers[device].c_str(); + return ggml_backend_rpc_buffer_type(endpoint); + } + device -= rpc_count; +#endif + + if (device < (int)model.devices.size()) { + return ggml_backend_dev_buffer_type(model.devices[device]); + } + device -= (int)model.devices.size(); + +#if defined(GGML_USE_METAL) + buft = ggml_backend_metal_buffer_type(); +#elif defined(GGML_USE_VULKAN) + buft = ggml_backend_vk_buffer_type(device); +#elif defined(GGML_USE_SYCL) + buft = ggml_backend_sycl_buffer_type(device); +#elif defined(GGML_USE_KOMPUTE) + buft = ggml_backend_kompute_buffer_type(device); +#elif defined(GGML_USE_CANN) + buft = ggml_backend_cann_buffer_type(device); +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_cpu(model, true); + } + return buft; + GGML_UNUSED(model); - GGML_UNUSED(local_gpu); } static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_CUDA - if (ggml_backend_cuda_get_device_count() > 1) { - buft = ggml_backend_cuda_split_buffer_type(tensor_split); + // 
find a backend that supports split buffers + for (size_t i = 0; i < ggml_backend_reg_count(); ++i) { + ggml_backend_reg_t reg = ggml_backend_reg_get(i); + + auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type"); + if (ggml_backend_split_buffer_type_fn) { + buft = ggml_backend_split_buffer_type_fn(tensor_split); + if (buft != nullptr) { + break; + } + } } -#endif #ifdef GGML_USE_SYCL if (ggml_backend_sycl_get_device_count() > 1) { @@ -3542,13 +3544,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo } static size_t llama_get_device_memory(const llama_model & model, int device) { -#ifdef GGML_USE_RPC - int rpc_count = (int)model.rpc_servers.size(); -#else - int rpc_count = 0; -#endif - int local_device = device - rpc_count; #if defined(GGML_USE_RPC) + int rpc_count = (int)model.rpc_servers.size(); if (device < rpc_count) { size_t total; size_t free; @@ -3556,32 +3553,37 @@ static size_t llama_get_device_memory(const llama_model & model, int device) { ggml_backend_rpc_get_device_memory(endpoint, &free, &total); return free; } + device = device - rpc_count; #endif -#if defined(GGML_USE_CUDA) + + if (device < (int)model.devices.size()) { + ggml_backend_dev_t dev = model.devices[device]; + size_t total; + size_t free; + ggml_backend_dev_memory(dev, &free, &total); + return free; + } + +#if defined(GGML_USE_SYCL) size_t total; size_t free; - ggml_backend_cuda_get_device_memory(local_device, &free, &total); - return free; -#elif defined(GGML_USE_SYCL) - size_t total; - size_t free; - ggml_backend_sycl_get_device_memory(local_device, &free, &total); + ggml_backend_sycl_get_device_memory(device, &free, &total); return free; #elif defined(GGML_USE_VULKAN) size_t total; size_t free; - ggml_backend_vk_get_device_memory(local_device, &free, &total); + ggml_backend_vk_get_device_memory(device, &free, &total); return free; #elif defined(GGML_USE_CANN) size_t total; size_t free; - ggml_backend_cann_get_device_memory(local_device, &free, &total); + ggml_backend_cann_get_device_memory(device, &free, &total); return free; #else return 1; #endif GGML_UNUSED(model); - GGML_UNUSED(local_device); + GGML_UNUSED(device); } // @@ -3624,7 +3626,7 @@ static bool llama_kv_cache_init( buft_layer_count[model.buft_layer[i].buft]++; } } else { - buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; + buft_layer_count[llama_default_buffer_type_cpu(model, true)] = n_layer; } // create a context for each buffer type @@ -5037,7 +5039,7 @@ struct llama_model_loader { // Returns false if cancelled by progress_callback bool load_all_data( struct ggml_context * ctx, - llama_buf_map & bufs_mmap, + llama_buf_map & bufs, llama_mlocks * lmlocks, llama_progress_callback progress_callback, void * progress_callback_user_data) { @@ -5046,43 +5048,94 @@ struct llama_model_loader { std::vector> read_buf; std::vector>> validation_result; -#if defined(GGML_USE_CUDA) // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. 
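The hunks below generalize the loader's async-upload path from CUDA-only code to any backend whose device reports async, host-buffer and event support. Pinned (page-locked) staging buffers let the device copy one chunk over PCIe while the next chunk is still being read from disk, and the per-buffer events keep the loop from overwriting a staging buffer whose upload is still in flight. What follows is a condensed sketch of that rotation, not the loader itself: it assumes the ggml_backend_event_new/ggml_backend_event_record signatures introduced in this series and a hypothetical read_chunk() file reader.

```cpp
#include "ggml-backend.h"
#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical helper: reads the next `n` bytes of tensor data from disk into `dst`.
void read_chunk(void * dst, size_t n);

// Stream `n_size` bytes into `tensor` through a ring of pinned staging buffers.
void upload_streamed(ggml_backend_dev_t dev, ggml_backend_t backend,
                     ggml_backend_buffer_type_t host_buft,
                     struct ggml_tensor * tensor, size_t n_size) {
    constexpr size_t n_buffers   = 4;
    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1 MiB per staging buffer

    std::vector<ggml_backend_buffer_t> bufs(n_buffers);
    std::vector<void *>                ptrs(n_buffers);
    std::vector<ggml_backend_event_t>  events(n_buffers);
    for (size_t i = 0; i < n_buffers; ++i) {
        bufs[i]   = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); // pinned host memory
        ptrs[i]   = ggml_backend_buffer_get_base(bufs[i]);
        events[i] = ggml_backend_event_new(dev);
    }

    size_t idx = 0;
    for (size_t off = 0; off < n_size; ) {
        const size_t n = std::min(buffer_size, n_size - off);
        ggml_backend_event_synchronize(events[idx]);   // buffer idx must be idle before reuse
        read_chunk(ptrs[idx], n);                      // disk -> pinned host memory
        ggml_backend_tensor_set_async(backend, tensor, ptrs[idx], off, n); // host -> device
        ggml_backend_event_record(events[idx], backend);
        off += n;
        idx  = (idx + 1) % n_buffers;                  // rotate staging buffers
    }

    for (size_t i = 0; i < n_buffers; ++i) {
        ggml_backend_event_synchronize(events[i]);
        ggml_backend_event_free(events[i]);
        ggml_backend_buffer_free(bufs[i]);
    }
}
```

The defaults above (four buffers of 1 MiB each) mirror the constants used by the loader code that follows.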
constexpr size_t n_buffers = 4; constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB std::vector host_buffers; - std::vector host_ptrs; std::vector events; + std::vector host_ptrs; size_t buffer_idx = 0; // buffer to use for async loads - - ggml_backend_t cuda_backend = nullptr; - if (!use_mmap && !check_tensors) { + ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t { + if (use_mmap || check_tensors) { + return nullptr; + } // When not using mmaped io use async uploads from pinned memory to GPU memory. - // First determine if the CUDA backend is active, and if so, determine the device ID. - ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr; - if (buf) { - ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf); - for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { - auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i); - if (buffer_type == cuda_buffer_type) { - cuda_backend = ggml_backend_cuda_init(i); - break; - } - } + // First determine if the backend supports the necessary features for async uploads. + auto * buf = bufs.count(0) ? bufs.at(0) : nullptr; + if (!buf) { + LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn); + return nullptr; } - // If the cuda backend is active create pinned memory buffers and events for synchronisation. - if (cuda_backend) { - for (size_t idx = 0; idx < n_buffers; ++idx) { - host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size)); - host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx])); - events.emplace_back(ggml_backend_event_new(cuda_backend)); - } + auto * buft = ggml_backend_buffer_get_type(buf); + auto * dev = ggml_backend_buft_get_device(buft); + if (!dev) { + LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn, + ggml_backend_buft_name(buft)); + return nullptr; } + + if (buft != ggml_backend_dev_buffer_type(dev)) { + LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn, + ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); + return nullptr; + } + + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { + LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (!host_buft) { + LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + // If the backend is supported, create pinned memory buffers and events for synchronisation. 
+ for (size_t idx = 0; idx < n_buffers; ++idx) { + auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf) { + LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + host_buffers.emplace_back(buf); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); + + auto * event = ggml_backend_event_new(dev); + if (!event) { + LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + events.emplace_back(event); + } + + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (!backend) { + LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn, + ggml_backend_dev_name(dev)); + return nullptr; + } + + return backend; + }(__func__); + + if (upload_backend) { + LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, + ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), + ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), + ggml_backend_name(upload_backend)); } -#endif for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); @@ -5102,8 +5155,8 @@ struct llama_model_loader { if (use_mmap) { const auto & mapping = mappings.at(weight->idx); ggml_backend_buffer_t buf_mmap = nullptr; - if (bufs_mmap.count(weight->idx)) { - buf_mmap = bufs_mmap.at(weight->idx); + if (bufs.count(weight->idx)) { + buf_mmap = bufs.at(weight->idx); } uint8_t * data = (uint8_t *) mapping->addr + weight->offs; @@ -5139,9 +5192,8 @@ struct llama_model_loader { })); } } else { -#if defined(GGML_USE_CUDA) - // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. - if (cuda_backend) { + // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. 
+ if (upload_backend) { file->seek(weight->offs, SEEK_SET); size_t bytes_read = 0; @@ -5151,17 +5203,14 @@ struct llama_model_loader { ggml_backend_event_synchronize(events[buffer_idx]); file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); - ggml_backend_event_record(events[buffer_idx]); + ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx], upload_backend); bytes_read += read_iteration; ++buffer_idx; buffer_idx %= n_buffers; } - } - else -#endif - { + } else { read_buf.resize(n_size); file->seek(weight->offs, SEEK_SET); file->read_raw(read_buf.data(), n_size); @@ -5176,17 +5225,15 @@ struct llama_model_loader { size_done += n_size; } -#if defined(GGML_USE_CUDA) - // free temporary resources used for async cuda uploads - if (cuda_backend) { - for (size_t idx = 0; idx < n_buffers;++idx) { - ggml_backend_event_synchronize(events[idx]); - ggml_backend_event_free(events[idx]); - ggml_backend_buffer_free(host_buffers[idx]); - } - ggml_backend_free(cuda_backend); + // free temporary resources used for async uploads + for (auto * event : events) { + ggml_backend_event_synchronize(event); + ggml_backend_event_free(event); } -#endif + for (auto * buf : host_buffers) { + ggml_backend_buffer_free(buf); + } + ggml_backend_free(upload_backend); // check validation results bool validation_failed = false; @@ -6922,6 +6969,13 @@ static bool llm_load_tensors( void * progress_callback_user_data) { auto & hparams = model.hparams; + // check if the value of main_gpu is valid + if (llama_get_device_count(model) > 0 && + split_mode != LLAMA_SPLIT_MODE_LAYER && + (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) { + throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model))); + } + model.split_mode = split_mode; model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; @@ -6931,14 +6985,14 @@ static bool llm_load_tensors( bool use_mmap_buffer = true; // there is very little benefit to offloading the input layer, so always keep it on the CPU - model.buft_input = llama_default_buffer_type_cpu(true); + model.buft_input = llama_default_buffer_type_cpu(model, true); //model.buft_input = llama_default_buffer_type_offload(main_gpu); model.buft_layer.resize(n_layer); // assign cpu layers for (int i = 0; i < i_gpu_start; ++i) { - model.buft_layer[i] = llama_default_buffer_type_cpu(true); + model.buft_layer[i] = llama_default_buffer_type_cpu(model, true); } if (split_mode == LLAMA_SPLIT_MODE_LAYER) { @@ -6976,7 +7030,7 @@ static bool llm_load_tensors( int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin(); model.buft_output = llama_default_buffer_type_offload(model, layer_gpu); } else { - model.buft_output = llama_default_buffer_type_cpu(true); + model.buft_output = llama_default_buffer_type_cpu(model, true); } } else { ggml_backend_buffer_type_t split_buft; @@ -7000,7 +7054,7 @@ static bool llm_load_tensors( llama_default_buffer_type_offload(model, main_gpu) }; } else { - model.buft_output = llama_default_buffer_type_cpu(true); + model.buft_output = llama_default_buffer_type_cpu(model, true); } } @@ -8872,7 +8926,7 @@ static bool llm_load_tensors( // only the mmap region containing the tensors in the model is mapped to the backend buffer // this is 
important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size - if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) { + if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(model, true)) { for (uint32_t idx = 0; idx < ml.files.size(); idx++) { void * addr = nullptr; size_t first, last; @@ -8886,13 +8940,6 @@ static bool llm_load_tensors( } model.bufs.push_back(buf); bufs.emplace(idx, buf); -#ifdef GGML_USE_CUDA - if (n_layer >= n_gpu_layers) { - ggml_backend_cuda_register_host_buffer( - ggml_backend_buffer_get_base(buf), - ggml_backend_buffer_get_size(buf)); - } -#endif } } #ifdef GGML_USE_METAL @@ -16956,7 +17003,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { lctx.embd = nullptr; } - lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size); + lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(lctx.model, true), new_size); if (lctx.buf_output == nullptr) { LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); return 0; @@ -18987,21 +19034,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { } size_t llama_max_devices(void) { -#if defined(GGML_USE_RPC) - return GGML_RPC_MAX_SERVERS; -#elif defined(GGML_USE_METAL) - return 1; -#elif defined(GGML_USE_CUDA) - return GGML_CUDA_MAX_DEVICES; -#elif defined(GGML_USE_SYCL) - return GGML_SYCL_MAX_DEVICES; -#elif defined(GGML_USE_VULKAN) - return GGML_VK_MAX_DEVICES; -#elif defined(GGML_USE_CANN) - return GGML_CANN_MAX_DEVICES; -#else - return 1; -#endif + return 16; } bool llama_supports_mmap(void) { @@ -19013,12 +19046,13 @@ bool llama_supports_mlock(void) { } bool llama_supports_gpu_offload(void) { -#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ +#if defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. 
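The #else branch just below replaces a hard-coded `return false` with a query against the new backend device registry, so GPU offload support becomes a run-time property rather than a compile-time one. Here is a minimal sketch of that style of probing, using only the registry calls that appear in these hunks; note that only backends already ported to the device registry will show up in the enumeration at this point in the series.

```cpp
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // enumerate every device known to the registry
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s (type %d)\n", i, ggml_backend_dev_name(dev),
               (int) ggml_backend_dev_type(dev));
    }

    // run-time equivalent of "is a GPU backend compiled in and present?"
    const bool has_gpu =
        ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU)      != nullptr ||
        ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
    printf("GPU offload available: %s\n", has_gpu ? "yes" : "no");
    return 0;
}
```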
return true; #else - return false; + return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr || + ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr; #endif } @@ -19083,17 +19117,30 @@ struct llama_model * llama_load_model_from_file( return true; }; } + if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') { // split the servers set them into model->rpc_servers std::string servers(params.rpc_servers); size_t pos = 0; - while ((pos = servers.find(",")) != std::string::npos) { + while ((pos = servers.find(',')) != std::string::npos) { std::string server = servers.substr(0, pos); model->rpc_servers.push_back(server); servers.erase(0, pos + 1); } model->rpc_servers.push_back(servers); } + + // create list of devices to use with this model + // currently, we use all available devices + // TODO: rework API to give user more control over device selection + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + // skip the CPU backend since it is handled separately + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU_FULL) { + model->devices.push_back(dev); + } + } + int status = llama_model_load(path_model, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { @@ -19255,6 +19302,36 @@ struct llama_context * llama_new_context_with_model( if (!hparams.vocab_only) { // initialize backends + int main_gpu = model->main_gpu; + + // with registry + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) { + ggml_backend_dev_t main_dev = model->devices[main_gpu]; + ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev)); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } + } else { + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + for (auto * dev : model->devices) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } + } + if (main_gpu >= (int)model->devices.size()) { + main_gpu -= (int)model->devices.size(); + } + #if defined(GGML_USE_RPC) if (model->n_gpu_layers > 0) { for (const auto & endpoint : model->rpc_servers) { @@ -19267,6 +19344,9 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(backend); } } + if (main_gpu >= (int)model->rpc_servers.size()) { + main_gpu -= (int)model->rpc_servers.size(); + } #endif #if defined(GGML_USE_METAL) @@ -19279,28 +19359,6 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_metal); } -#elif defined(GGML_USE_CUDA) - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } else { - // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU - for (int device = 0; device < 
ggml_backend_cuda_get_device_count(); ++device) { - ggml_backend_t backend = ggml_backend_cuda_init(device); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } - } #elif defined(GGML_USE_VULKAN) if (model->split_mode == LLAMA_SPLIT_MODE_ROW) { LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__); @@ -19308,7 +19366,7 @@ struct llama_context * llama_new_context_with_model( return nullptr; } if (model->split_mode == LLAMA_SPLIT_MODE_NONE) { - ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu); + ggml_backend_t backend = ggml_backend_vk_init(main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__); llama_free(ctx); @@ -19329,9 +19387,9 @@ struct llama_context * llama_new_context_with_model( #elif defined(GGML_USE_SYCL) // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); + ggml_backend_t backend = ggml_backend_sycl_init(main_gpu); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu); llama_free(ctx); return nullptr; } @@ -19350,7 +19408,7 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_KOMPUTE) if (model->n_gpu_layers > 0) { - auto * backend = ggml_backend_kompute_init(model->main_gpu); + auto * backend = ggml_backend_kompute_init(main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__); llama_free(ctx); @@ -19359,29 +19417,29 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(backend); } #elif defined(GGML_USE_CANN) - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - // TODO: ggml_backend_cann is not support split tensor now, just leave code here. - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } else { - // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU - // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version. - for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) { - ggml_backend_t backend = ggml_backend_cann_init(device); + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + // TODO: ggml_backend_cann is not support split tensor now, just leave code here. 
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + ggml_backend_t backend = ggml_backend_cann_init(main_gpu); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device); + LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version. + for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_cann_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } } - } #endif #ifdef GGML_USE_BLAS @@ -19446,7 +19504,7 @@ struct llama_context * llama_new_context_with_model( for (auto * backend : ctx->backends) { if (ggml_backend_is_cpu(backend)) { // use host buffers for the CPU backend compute buffer - backend_buft.push_back(llama_default_buffer_type_cpu(true)); + backend_buft.push_back(llama_default_buffer_type_cpu(*model, true)); } else { backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); } @@ -19457,17 +19515,37 @@ struct llama_context * llama_new_context_with_model( // buffer used to store the computation graph and the tensor meta data ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + // TODO: move these checks to ggml_backend_sched // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = llama_get_device_count(*model) > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER && params.offload_kqv; -#ifndef GGML_USE_CUDA - // pipeline parallelism requires support for async compute and events - // currently this is only implemented in the CUDA backend - pipeline_parallel = false; -#endif + + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto * backend : ctx->backends) { + if (ggml_backend_is_cpu(backend)) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend); + if (!dev) { + // backend is using old interface, not supported + pipeline_parallel = false; + break; + } + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } + } + } + ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel); if (pipeline_parallel) { @@ -21774,10 +21852,11 @@ const std::vector> & llama_internal void llama_log_set(ggml_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? 
log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; + + ggml_backend_set_log_callback(log_callback, user_data); + #ifdef GGML_USE_METAL ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); -#elif defined(GGML_USE_CUDA) - ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); #elif defined(GGML_USE_CANN) ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); #endif diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 95d983aa0..86a0b379b 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -672,14 +672,11 @@ struct test_case { } // run - ggml_backend_synchronize(backend); - int64_t total_time_us = 0; int total_runs = 0; do { int64_t start_time = ggml_time_us(); ggml_backend_graph_compute(backend, gf); - ggml_backend_synchronize(backend); int64_t end_time = ggml_time_us(); total_time_us += end_time - start_time; @@ -3723,20 +3720,22 @@ int main(int argc, char ** argv) { } // enumerate backends - printf("Testing %zu backends\n\n", ggml_backend_reg_get_count()); + printf("Testing %zu devices\n\n", ggml_backend_dev_count()); size_t n_ok = 0; - for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) { - printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i)); + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) { + printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev)); + + if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) { printf(" Skipping\n"); n_ok++; continue; } - ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL); + ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); GGML_ASSERT(backend != NULL); if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) { @@ -3751,7 +3750,11 @@ int main(int argc, char ** argv) { ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2); } - printf(" Backend name: %s\n", ggml_backend_name(backend)); + printf(" Device description: %s\n", ggml_backend_dev_description(dev)); + size_t free, total; // NOLINT + ggml_backend_dev_memory(dev, &free, &total); + printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); + printf("\n"); bool ok = test_backend(backend, mode, op_name_filter); @@ -3768,9 +3771,9 @@ int main(int argc, char ** argv) { ggml_backend_free(backend); } - printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count()); + printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count()); - if (n_ok != ggml_backend_reg_get_count()) { + if (n_ok != ggml_backend_dev_count()) { printf("\033[1;31mFAIL\033[0m\n"); return 1; } From 5639971466ed74386a1811938022f0c333007b55 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Thu, 3 Oct 2024 07:50:44 +0100 Subject: [PATCH 31/40] Fixed dequant precision issues in Q4_1 and Q5_1 (#9711) --- ggml/src/ggml-sycl/dequantize.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index 8f4041fff..b8304c3a2 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -55,12 +55,12 @@ static __dpct_inline__ void 
dequantize_q4_1(const void *vx, const int64_t ib, #ifdef GGML_SYCL_F16 // v = v * {d, d}; // v = v + {m, m}; - v.s0() = (v.s0() * d) + m; - v.s1() = (v.s1() * d) + m; + v.s0() = sycl::fma(v.s0(), d, m); + v.s1() = sycl::fma(v.s1(), d, m); #else - v.x() = (v.x() * d) + m; - v.y() = (v.y() * d) + m; + v.x() = sycl::fma(v.x(), d, m); + v.y() = sycl::fma(v.y(), d, m); #endif // GGML_SYCL_F16 } @@ -110,11 +110,11 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib, #ifdef GGML_SYCL_F16 // v = v * {d, d}; // v = v + {m, m}; - v.s0() = (v.s0() * d) + m; - v.s1() = (v.s1() * d) + m; + v.s0() = sycl::fma(v.s0(), d, m); + v.s1() = sycl::fma(v.s1(), d, m); #else - v.x() = (v.x() * d) + m; - v.y() = (v.y() * d) + m; + v.x() = sycl::fma(v.x(), d, m); + v.y() = sycl::fma(v.y(), d, m); #endif // GGML_SYCL_F16 } From 841713e1e487bdb82fd106a52ad998c5f87b59e9 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Thu, 3 Oct 2024 13:00:52 +0300 Subject: [PATCH 32/40] rpc : enable vulkan (#9714) closes #8536 --- examples/rpc/rpc-server.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 6342e6488..355125831 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -6,6 +6,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_VULKAN +#include "ggml-vulkan.h" +#endif + #include "ggml-rpc.h" #ifdef _WIN32 # include @@ -79,6 +83,12 @@ static ggml_backend_t create_backend() { if (!backend) { fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); } +#elif GGML_USE_VULKAN + fprintf(stderr, "%s: using Vulkan backend\n", __func__); + backend = ggml_backend_vk_init(0); // init device 0 + if (!backend) { + fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__); + } #endif // if there aren't GPU Backends fallback to CPU backend @@ -92,6 +102,8 @@ static ggml_backend_t create_backend() { static void get_backend_memory(size_t * free_mem, size_t * total_mem) { #ifdef GGML_USE_CUDA ggml_backend_cuda_get_device_memory(0, free_mem, total_mem); +#elif GGML_USE_VULKAN + ggml_backend_vk_get_device_memory(0, free_mem, total_mem); #else #ifdef _WIN32 MEMORYSTATUSEX status; From e3c355ba654d4164c1c09e5d0dcacecb8b214af8 Mon Sep 17 00:00:00 2001 From: compilade Date: Thu, 3 Oct 2024 10:22:15 -0400 Subject: [PATCH 33/40] convert : handle tokenizer merges format from transformers 4.45 (#9696) --- gguf-py/gguf/vocab.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index dc5749913..f2645f921 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -122,8 +122,30 @@ class SpecialVocab: tokenizer = json.load(f) if self.load_merges: merges = tokenizer.get('model', {}).get('merges') - if isinstance(merges, list) and merges and isinstance(merges[0], str): - self.merges = merges + if isinstance(merges, list) and merges: + if isinstance(merges[0], str): + self.merges = merges + elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str): + # New format since transformers 4.45 to support spaces in merges + # ref: https://github.com/ggerganov/llama.cpp/issues/9692 + # TODO: internally store as the new format instead of converting to old + if any(' ' in s for pair in merges for s in pair): + logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}') + self.merges = [ + ' '.join( + [ + # ensure the spaces are properly encoded + ''.join( + 
chr(ord(c) + 256) if c == ' ' else c + for c in part + ) + for part in pair + ] + ) + for pair in merges + ] + else: + raise ValueError("Unknown tokenizer merges format") added_tokens = tokenizer.get('added_tokens', {}) else: added_tokens = {} From d6fe7abf04e8ec5240dead6e2773ed1b7e7495d3 Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Thu, 3 Oct 2024 12:39:03 -0300 Subject: [PATCH 34/40] ggml: unify backend logging mechanism (#9709) * Add scaffolding for ggml logging macros * Metal backend now uses GGML logging * Cuda backend now uses GGML logging * Cann backend now uses GGML logging * Add enum tag to parameters * Use C memory allocation funcs * Fix compile error * Use GGML_LOG instead of GGML_PRINT * Rename llama_state to llama_logger_state * Prevent null format string * Fix whitespace * Remove log callbacks from ggml backends * Remove cuda log statement --- ggml/include/ggml-backend.h | 5 +- ggml/include/ggml-cann.h | 11 --- ggml/include/ggml-cuda.h | 2 - ggml/include/ggml-metal.h | 2 - ggml/include/ggml.h | 4 + ggml/src/ggml-backend-impl.h | 3 - ggml/src/ggml-backend.cpp | 14 --- ggml/src/ggml-cann.cpp | 89 +++--------------- ggml/src/ggml-cuda.cu | 102 ++++++--------------- ggml/src/ggml-impl.h | 15 +++ ggml/src/ggml-metal.m | 172 +++++++++++++---------------------- ggml/src/ggml.c | 92 ++++++++++++++----- src/llama.cpp | 26 ++---- 13 files changed, 197 insertions(+), 340 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index b096aaed6..864bcbded 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -164,7 +164,7 @@ extern "C" { GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg); GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index); GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name); - GGML_API void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data); + // Functions that may be obtained using ggml_backend_reg_get_proc_address typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *); @@ -184,9 +184,6 @@ extern "C" { GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name); GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type); - // Set the log callback for all registered backends - GGML_API void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data); - // Direct backend (stream) initialization // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params) GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h index ba9ff2292..95bdaf10d 100644 --- a/ggml/include/ggml-cann.h +++ b/ggml/include/ggml-cann.h @@ -116,17 +116,6 @@ GGML_API void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, size_t* total); -/** - * @brief Set the logging callback for GGML. - * - * This function sets the logging callback and user data for logging. - * - * @param log_callback The logging callback to set. - * @param user_data User data to pass to the logging callback. 
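With this patch the per-backend log setters in the CUDA, Metal and CANN headers are removed, and a single ggml_log_set() entry point is added to ggml/include/ggml.h further down. A minimal usage sketch follows, with a hypothetical callback name; the ggml_log_callback signature is the one shown in the ggml.h hunk.

```cpp
#include "ggml.h"
#include "llama.h"
#include <cstdio>

// Hypothetical callback: prefix every message with its severity and send it to stderr.
static void my_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    fprintf(stderr, "[%d] %s", (int) level, text);
}

int main() {
    ggml_log_set(my_log_cb, /*user_data=*/nullptr);  // ggml and all of its backends
    llama_log_set(my_log_cb, /*user_data=*/nullptr); // llama.cpp's own log lines
    // ... load a model, run inference; backend messages now flow through my_log_cb ...
    return 0;
}
```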
- */ -GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback, - void* user_data); - #ifdef __cplusplus } #endif diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index a8feddc94..f44d8f4e6 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -40,8 +40,6 @@ GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, siz GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); -GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data); - GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void); #ifdef __cplusplus diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index 55e6ecd84..c3ec572b2 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -39,8 +39,6 @@ extern "C" { // user-code should use only these functions // -GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); - GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 969be3e94..1b4006b62 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2167,6 +2167,10 @@ extern "C" { typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); + // Set callback for all future logging events. + // If this is not called, or NULL is supplied, everything is output on stderr. + GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); + // optimization parameters // // see ggml.c (ggml_opt_default_params) for default values diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 470c922fe..ba2e26999 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -215,9 +215,6 @@ extern "C" { // (optional) get a pointer to a function in the backend // backends can add custom functions that are not part of the standard ggml-backend interface void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name); - - // (optional) set the log callback for the backend - void (*set_log_callback)(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data); }; struct ggml_backend_reg { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 73a2b24f8..3300ddb52 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -505,12 +505,6 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na return reg->iface.get_proc_address(reg, name); } -void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) { - if (reg->iface.set_log_callback) { - reg->iface.set_log_callback(reg, log_callback, user_data); - } -} - // Backend registry #ifdef GGML_USE_CUDA @@ -614,13 +608,6 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { return NULL; } -void ggml_backend_set_log_callback(ggml_log_callback log_callback, void * user_data) { - for (size_t i = 0; i < ggml_backend_reg_count(); i++) { - ggml_backend_reg_t reg = ggml_backend_reg_get(i); - ggml_backend_reg_set_log_callback(reg, log_callback, user_data); - } -} - // Convenience functions ggml_backend_t ggml_backend_init_by_name(const char * name, const char * 
params) { ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); @@ -1161,7 +1148,6 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = { /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count, /* .get_device = */ ggml_backend_cpu_reg_get_device, /* .get_proc_address = */ NULL, - /* .set_log_callback = */ NULL, }; ggml_backend_reg_t ggml_backend_cpu_reg(void) { diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 63ad0b878..db5f8f186 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -39,69 +39,6 @@ #include "ggml-common.h" -/** - * @brief Default logging callback for GGML. - * - * This function is the default logging callback that logs messages to stderr. - * - * @param level The log level. - * @param msg The log message. - * @param user_data User data passed to the callback. - */ -static void ggml_cann_default_log_callback(enum ggml_log_level level, - const char* msg, void* user_data) { - GGML_UNUSED(level); - GGML_UNUSED(user_data); - fprintf(stderr, "%s", msg); -} - -ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback; -void* ggml_cann_log_user_data = NULL; - -GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback, - void* user_data) { - ggml_cann_log_callback = log_callback; - ggml_cann_log_user_data = user_data; -} - -#define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) -#define GGML_CANN_LOG_ERROR(...) \ - ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) - -GGML_ATTRIBUTE_FORMAT(2, 3) - -/** - * @brief Log a message using the current logging callback. - * - * This function formats a log message and passes it to the current logging - * callback. - * - * @param level The log level. - * @param format The format string for the log message. - * @param ... The arguments for the format string. - */ -static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) { - if (ggml_cann_log_callback != NULL) { - va_list args; - va_start(args, format); - char buffer[128]; - int len = vsnprintf(buffer, 128, format, args); - if (len < 128) { - ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data); - } else { - // vsnprintf adds a null terminator - std::vector buffer2(len + 1); - va_end(args); - va_start(args, format); - vsnprintf(&buffer2[0], buffer2.size(), format, args); - ggml_cann_log_callback(level, buffer2.data(), - ggml_cann_log_user_data); - } - va_end(args); - } -} - /** * @brief Handles CANN errors by printing an error message and aborting. * @@ -116,10 +53,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) 
{ int32_t id = -1; aclrtGetDevice(&id); - GGML_CANN_LOG_ERROR("CANN error: %s\n", msg); - GGML_CANN_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, + GGML_LOG_ERROR("CANN error: %s\n", msg); + GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); - GGML_CANN_LOG_ERROR(" %s\n", stmt); + GGML_LOG_ERROR(" %s\n", stmt); // abort with GGML_ASSERT to get a stack trace GGML_ABORT("CANN error"); } @@ -165,7 +102,7 @@ static ggml_cann_device_info ggml_cann_init() { aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count); if (err != ACL_SUCCESS) { - GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n", + GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg()); return info; } @@ -315,7 +252,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { *actual_size = look_ahead_size; pool_size += look_ahead_size; #ifdef DEBUG_CANN_MALLOC - GGML_CANN_LOG_INFO( + GGML_LOG_INFO( "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " "requested %u MB\n", __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024), @@ -470,7 +407,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // add to the pool pool_size += reserve_size; - // GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB ( + // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB ( // reserved %llu MB)\n", // device, (unsigned long long) (pool_size/1024/1024), // (unsigned long long) (reserve_size/1024/1024)); @@ -483,7 +420,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { pool_used += size; #ifdef DEBUG_CANN_MALLOC - GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, + GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long)size, (unsigned long long)ptr); #endif return ptr; @@ -497,7 +434,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ void free(void* ptr, size_t size) override { #ifdef DEBUG_CANN_MALLOC - GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, + GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long)size, (unsigned long long)ptr); #endif @@ -1095,7 +1032,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, void* dev_ptr; aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); if (err != ACL_SUCCESS) { - GGML_CANN_LOG_ERROR( + GGML_LOG_ERROR( "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg()); @@ -1280,7 +1217,7 @@ static void * ggml_cann_host_malloc(size_t size) { aclError err = aclrtMallocHost((void **) &hostPtr, size); if (err != ACL_SUCCESS) { - GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, aclGetRecentErrMsg()); return nullptr; } @@ -1733,7 +1670,7 @@ static enum ggml_status ggml_backend_cann_graph_compute( bool ok = ggml_cann_compute_forward(*cann_ctx, node); if (!ok) { - GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, + GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); @@ -2043,13 +1980,13 @@ static ggml_guid_t ggml_backend_cann_guid() { ggml_backend_t ggml_backend_cann_init(int32_t device) { aclInit(nullptr); if (device < 0 || device >= ggml_backend_cann_get_device_count()) { - GGML_CANN_LOG_ERROR("%s: error: 
invalid device %d\n", __func__, device); + GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device); return nullptr; } ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); if (ctx == nullptr) { - GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return nullptr; } ggml_cann_set_device(ctx->device); diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 43151e235..663e97cd7 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -56,52 +56,14 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); -static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { - GGML_UNUSED(level); - GGML_UNUSED(user_data); - fprintf(stderr, "%s", msg); -} - -ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback; -void * ggml_cuda_log_user_data = NULL; - -GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) { - ggml_cuda_log_callback = log_callback; - ggml_cuda_log_user_data = user_data; -} - -#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) -#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) - -GGML_ATTRIBUTE_FORMAT(2, 3) -static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) { - if (ggml_cuda_log_callback != NULL) { - va_list args; - va_start(args, format); - char buffer[128]; - int len = vsnprintf(buffer, 128, format, args); - if (len < 128) { - ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data); - } else { - std::vector buffer2(len + 1); // vsnprintf adds a null terminator - va_end(args); - va_start(args, format); - vsnprintf(&buffer2[0], buffer2.size(), format, args); - ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data); - } - va_end(args); - } -} - [[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { int id = -1; // in case cudaGetDevice fails cudaGetDevice(&id); - GGML_CUDA_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg); - GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); - GGML_CUDA_LOG_ERROR(" %s\n", stmt); + GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg); + GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); + GGML_LOG_ERROR(" %s\n", stmt); // abort with GGML_ABORT to get a stack trace GGML_ABORT(GGML_CUDA_NAME " error"); } @@ -166,7 +128,7 @@ static ggml_cuda_device_info ggml_cuda_init() { cudaError_t err = cudaGetDeviceCount(&info.device_count); if (err != cudaSuccess) { - GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err)); + GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err)); return info; } @@ -174,16 +136,16 @@ static ggml_cuda_device_info ggml_cuda_init() { int64_t total_vram = 0; #ifdef GGML_CUDA_FORCE_MMQ - GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); + GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); #else - GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); + GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); #endif // GGML_CUDA_FORCE_MMQ #ifdef GGML_CUDA_FORCE_CUBLAS - GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", 
__func__); + GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__); #else - GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__); + GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__); #endif // GGML_CUDA_FORCE_CUBLAS - GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); + GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); for (int id = 0; id < info.device_count; ++id) { int device_vmm = 0; @@ -204,7 +166,7 @@ static ggml_cuda_device_info ggml_cuda_init() { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); info.default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -312,7 +274,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { *actual_size = look_ahead_size; pool_size += look_ahead_size; #ifdef DEBUG_CUDA_MALLOC - GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz, + GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024)); #endif return ptr; @@ -327,7 +289,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { return; } } - GGML_CUDA_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n"); + GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n"); ggml_cuda_set_device(device); CUDA_CHECK(cudaFree(ptr)); pool_size -= size; @@ -591,7 +553,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac if (err != cudaSuccess) { // clear the error cudaGetLastError(); - GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err)); + GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err)); return nullptr; } @@ -1016,7 +978,7 @@ static void * ggml_cuda_host_malloc(size_t size) { if (err != cudaSuccess) { // clear the error cudaGetLastError(); - GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, cudaGetErrorString(err)); return nullptr; } @@ -2283,7 +2245,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg break; case GGML_OP_MUL_MAT: if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { - GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); + GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); return false; } else { ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst); @@ -2367,7 +2329,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - GGML_CUDA_LOG_ERROR("%s: %s 
failed\n", __func__, ggml_op_desc(dst)); + GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst)); CUDA_CHECK(err); } @@ -2436,7 +2398,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); + GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); #endif return false; } @@ -2552,7 +2514,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__); + GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__); #endif } } @@ -2603,14 +2565,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); + GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); #endif } if (node->op == GGML_OP_MUL_MAT_ID) { use_cuda_graph = false; // This node type is not supported by CUDA graph capture #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); + GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); #endif } @@ -2619,7 +2581,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, // Changes in batch size or context size can cause changes to the grid size of some kernels. 
use_cuda_graph = false; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); #endif } @@ -2631,7 +2593,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (!ptr) { use_cuda_graph = false; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); + GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); #endif } else { if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) { @@ -2655,7 +2617,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); + GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); #endif } } @@ -2694,7 +2656,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); if (!ok) { - GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); } @@ -2713,7 +2675,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, use_cuda_graph = false; cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true; #ifndef NDEBUG - GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__); + GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__); #endif } else { graph_evaluated_or_captured = true; // CUDA graph has been captured @@ -2780,7 +2742,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); if (stat == cudaErrorGraphExecUpdateFailure) { #ifndef NDEBUG - GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__); + GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__); #endif // The pre-existing graph exec cannot be updated due to violated constraints // so instead clear error and re-instantiate @@ -2882,7 +2844,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { // clear the error cudaGetLastError(); - GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, + GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, cudaGetErrorString(err)); return false; } @@ -3305,17 +3267,11 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con return nullptr; } -static void ggml_backend_cuda_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback log_callback, void * user_data) { - GGML_UNUSED(reg); - ggml_backend_cuda_log_set_callback(log_callback, user_data); -} - static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = { /* .get_name = */ ggml_backend_cuda_reg_get_name, /* .get_device_count = */ 
ggml_backend_cuda_reg_get_device_count, /* .get_device_get = */ ggml_backend_cuda_reg_get_device, /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address, - /* .set_log_callback = */ ggml_backend_cuda_reg_set_log_callback, }; // backend registry @@ -3361,13 +3317,13 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { ggml_backend_t ggml_backend_cuda_init(int device) { if (device < 0 || device >= ggml_backend_cuda_get_device_count()) { - GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device); + GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device); return nullptr; } ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device); if (ctx == nullptr) { - GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__); + GGML_LOG_ERROR("%s: failed to allocate context\n", __func__); return nullptr; } diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 833984190..d3f4bad8c 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -33,6 +33,21 @@ extern "C" { #endif #endif +// +// logging +// + +GGML_ATTRIBUTE_FORMAT(2, 3) +void ggml_log_internal (enum ggml_log_level level, const char * format, ...); +void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data); + +#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) +#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) +#define GGML_LOG_WARN(...) ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) +#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) +#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) + // bitset typedef uint32_t ggml_bitset_t; diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 8ff16983e..7ffaaf8d8 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -18,19 +18,6 @@ // max number of MTLCommandBuffer used to submit a graph for processing #define GGML_METAL_MAX_COMMAND_BUFFERS 8 -#ifdef GGML_METAL_NDEBUG -#define GGML_METAL_LOG(...) -#define GGML_METAL_LOG_INFO(...) -#define GGML_METAL_LOG_WARN(...) -#define GGML_METAL_LOG_ERROR(...) -#else -#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__) -#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) -#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) -#define GGML_METAL_LOG_DEBUG(...) 
ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) -#endif - #define UNUSED(x) (void)(x) struct ggml_metal_kernel { @@ -277,51 +264,19 @@ struct ggml_backend_metal_context { @implementation GGMLMetalClass @end -static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { - fprintf(stderr, "%s", msg); - - UNUSED(level); - UNUSED(user_data); -} - -ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback; -void * ggml_metal_log_user_data = NULL; - -GGML_ATTRIBUTE_FORMAT(2, 3) -static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ - if (ggml_metal_log_callback != NULL) { - va_list args; - va_start(args, format); - char buffer[128]; - int len = vsnprintf(buffer, 128, format, args); - if (len < 128) { - ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); - } else { - char* buffer2 = malloc(len+1); - va_end(args); - va_start(args, format); - vsnprintf(buffer2, len+1, format, args); - buffer2[len] = 0; - ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); - free(buffer2); - } - va_end(args); - } -} - static void * ggml_metal_host_malloc(size_t n) { void * data = NULL; #if TARGET_OS_OSX kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE); if (err != KERN_SUCCESS) { - GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__); + GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__); return NULL; } #else const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { - GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); + GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); return NULL; } #endif @@ -330,20 +285,20 @@ static void * ggml_metal_host_malloc(size_t n) { } static struct ggml_backend_metal_context * ggml_metal_init(void) { - GGML_METAL_LOG_INFO("%s: allocating\n", __func__); + GGML_LOG_INFO("%s: allocating\n", __func__); #if TARGET_OS_OSX && !GGML_METAL_NDEBUG // Show all the Metal device instances in the system NSArray * devices = MTLCopyAllDevices(); for (id device in devices) { - GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]); + GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]); } [devices release]; // since it was created by a *Copy* C method #endif // Pick and show default Metal device id device = MTLCreateSystemDefaultDevice(); - GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); // Configure context struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context)); @@ -381,28 +336,28 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { if (try_metallib && path_lib != nil) { // pre-compiled library found NSURL * libURL = [NSURL fileURLWithPath:path_lib]; - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); + GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); metal_library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { - GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } else { #if GGML_METAL_EMBED_LIBRARY - GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__); + GGML_LOG_INFO("%s: using embedded 
metal library\n", __func__); extern const char ggml_metallib_start[]; extern const char ggml_metallib_end[]; NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; #else - GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); + GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); NSString * path_source; NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; - GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil"); + GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil"); if (path_resource) { path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"]; @@ -411,15 +366,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { } if (path_source == nil) { - GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); + GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); path_source = @"ggml-metal.metal"; } - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); + GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; if (error) { - GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } #endif // GGML_METAL_EMBED_LIBRARY @@ -435,7 +390,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { metal_library = [ctx->device newLibraryWithSource:src options:options error:&error]; if (error) { - GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -443,7 +398,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { } // print MTL GPU family: - GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); + GGML_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); const NSInteger MTLGPUFamilyMetal3 = 5001; @@ -453,21 +408,21 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { { for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { if ([ctx->device supportsFamily:i]) { - GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); + GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); break; } } for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) { if ([ctx->device supportsFamily:i]) { - GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i); + GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i); break; } } for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) { if ([ctx->device supportsFamily:i]) { - GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i); + GGML_LOG_INFO("%s: GPU 
family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i); break; } } @@ -478,9 +433,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7]; - GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false"); - GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); - GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + GGML_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false"); + GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); + GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); ctx->capture_next_compute = false; ctx->capture_started = false; @@ -494,13 +449,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { - GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); + GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); } #elif TARGET_OS_OSX if (ctx->device.maxTransferRate != 0) { - GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); + GGML_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); } else { - GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); + GGML_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); } #endif @@ -513,7 +468,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { } /* - GGML_METAL_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ + GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ (int) kernel->pipeline.threadExecutionWidth); \ */ @@ -524,12 +479,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \ [metal_function release]; \ if (error) { \ - GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ [metal_library release]; \ return NULL; \ } \ } else { \ - GGML_METAL_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ + GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ } // simd_sum and simd_max requires MTLGPUFamilyApple7 @@ -726,7 +681,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { } static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { - GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); + GGML_LOG_INFO("%s: deallocating\n", __func__); for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) { [ctx->kernels[i].pipeline release]; @@ -764,7 +719,7 @@ struct ggml_backend_metal_buffer_context { // Metal buffer based on 
the host memory pointer // static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) { - //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -776,17 +731,17 @@ static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs for (int i = 0; i < buf_ctx->n_buffers; ++i) { const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; - //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + //GGML_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { *offs = (size_t) ioffs; - //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); + //GGML_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); return buf_ctx->buffers[i].metal; } } - GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); + GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); return nil; } @@ -918,7 +873,7 @@ static void ggml_metal_encode_node( struct ggml_tensor * node = ggml_graph_node(gf, idx); - //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op)); + //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op)); struct ggml_tensor * src0 = node->src[0]; struct ggml_tensor * src1 = node->src[1]; @@ -944,7 +899,7 @@ static void ggml_metal_encode_node( } if (!ggml_metal_supports_op(ctx, dst)) { - GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); + GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); GGML_ABORT("unsupported op"); } @@ -1002,17 +957,17 @@ static void ggml_metal_encode_node( id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; id id_dst = dst ? 
ggml_metal_get_buffer(dst, &offs_dst) : nil; - //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //if (src0) { - // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, // ggml_is_contiguous(src0), src0->name); //} //if (src1) { - // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, // ggml_is_contiguous(src1), src1->name); //} //if (dst) { - // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // GGML_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, // dst->name); //} @@ -1404,7 +1359,7 @@ static void ggml_metal_encode_node( } break; default: { - GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); + GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); GGML_ABORT("fatal error"); } } break; @@ -1956,7 +1911,7 @@ static void ggml_metal_encode_node( } break; default: { - GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); + GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t); GGML_ABORT("not implemented"); } }; @@ -2252,7 +2207,7 @@ static void ggml_metal_encode_node( } break; default: { - GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); + GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t); GGML_ABORT("not implemented"); } }; @@ -2821,8 +2776,8 @@ static void ggml_metal_encode_node( //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; default: { - GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); - GGML_METAL_LOG_ERROR("add template specialization for this size\n"); + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); GGML_ABORT("add template specialization for this size"); } } @@ -2834,8 +2789,8 @@ static void ggml_metal_encode_node( //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; default: { - GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); - GGML_METAL_LOG_ERROR("add template specialization for this size\n"); + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); GGML_ABORT("add template specialization for this size"); } } @@ -2996,7 +2951,7 @@ static void ggml_metal_encode_node( } break; default: { - GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); + GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); GGML_ABORT("fatal error"); } } @@ -3041,7 +2996,7 @@ static enum ggml_status ggml_metal_graph_compute( NSError * error = nil; if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { - GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); + GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error 
localizedDescription] UTF8String]); GGML_ABORT("capture failed"); } else { [ctx->capture_scope beginScope]; @@ -3123,9 +3078,9 @@ static enum ggml_status ggml_metal_graph_compute( MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { - GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); if (status == MTLCommandBufferStatusError) { - GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -3138,9 +3093,9 @@ static enum ggml_status ggml_metal_graph_compute( MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { - GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -3157,7 +3112,7 @@ static enum ggml_status ggml_metal_graph_compute( } if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { - GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i); + GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); return GGML_STATUS_ABORTED; } @@ -3286,17 +3241,17 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s #ifndef GGML_METAL_NDEBUG #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { - GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n", + GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n", __func__, size_aligned / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0, device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { - GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); } } else { - GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", + GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", __func__, size_aligned / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0); @@ -3338,7 +3293,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba } if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); ggml_backend_metal_free_device(); return NULL; @@ -3423,7 +3378,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == 
nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); return false; } } @@ -3449,7 +3404,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); return false; } } @@ -3457,7 +3412,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz ggml_backend_metal_log_allocated_size(device, size_step_aligned); if (i + size_step < size) { - GGML_METAL_LOG_INFO("\n"); + GGML_LOG_INFO("\n"); } ++ctx->n_buffers; @@ -3514,7 +3469,7 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS); if (ctx->n_cb > 2) { - GGML_METAL_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb); + GGML_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb); } } @@ -3544,11 +3499,6 @@ static struct ggml_backend_i ggml_backend_metal_i = { /* .event_wait = */ NULL, }; -void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { - ggml_metal_log_callback = log_callback; - ggml_metal_log_user_data = user_data; -} - static ggml_guid_t ggml_backend_metal_guid(void) { static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 }; return &guid; @@ -3557,7 +3507,7 @@ static ggml_guid_t ggml_backend_metal_guid(void) { ggml_backend_t ggml_backend_metal_init(void) { struct ggml_backend_metal_context * ctx = ggml_metal_init(); if (ctx == NULL) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return NULL; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e740e58b2..de500a675 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -319,26 +319,63 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ // logging // +struct ggml_logger_state { + ggml_log_callback log_callback; + void * log_callback_user_data; +}; +static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL}; + +static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) { + if (format == NULL) + return; + va_list args_copy; + va_copy(args_copy, args); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data); + } else { + char * buffer2 = (char *) calloc(len + 1, sizeof(char)); + vsnprintf(buffer2, len + 1, format, args_copy); + buffer2[len] = 0; + g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data); + free(buffer2); + } + va_end(args_copy); +} + +void ggml_log_internal(enum ggml_log_level level, const char * format, ...) { + va_list args; + va_start(args, format); + ggml_log_internal_v(level, format, args); + va_end(args); +} + +void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + #if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__) #else #define GGML_PRINT_DEBUG(...) #endif #if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__) #else #define GGML_PRINT_DEBUG_5(...) #endif #if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__) #else #define GGML_PRINT_DEBUG_10(...) #endif -#define GGML_PRINT(...) printf(__VA_ARGS__) - // // end of logging block // @@ -355,7 +392,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ #else inline static void * ggml_aligned_malloc(size_t size) { if (size == 0) { - GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); + GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); return NULL; } void * aligned_memory = NULL; @@ -377,7 +414,7 @@ inline static void * ggml_aligned_malloc(size_t size) { error_desc = "insufficient memory"; break; } - GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); + GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); GGML_ABORT("fatal error"); return NULL; } @@ -393,12 +430,12 @@ inline static void * ggml_aligned_malloc(size_t size) { inline static void * ggml_malloc(size_t size) { if (size == 0) { - GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n"); + GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n"); return NULL; } void * result = malloc(size); if (result == NULL) { - GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); + GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); GGML_ABORT("fatal error"); } return result; @@ -407,12 +444,12 @@ inline static void * ggml_malloc(size_t size) { // calloc inline static void * ggml_calloc(size_t num, size_t size) { if (num == 0 || size == 0) { - GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n"); + GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n"); return NULL; } void * result = calloc(num, size); if (result == NULL) { - GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); + GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); GGML_ABORT("fatal error"); } return result; @@ -3347,7 +3384,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { if (fptr != NULL) { char buf[42]; if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { - GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); } fclose(fptr); } @@ -3365,21 +3402,21 @@ bool ggml_is_numa(void) { //////////////////////////////////////////////////////////////////////////////// void ggml_print_object(const struct ggml_object * obj) { - GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n", + GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n", obj->type, obj->offs, obj->size, (const void *) obj->next); } void ggml_print_objects(const struct ggml_context * ctx) { struct ggml_object * obj = ctx->objects_begin; - GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); + GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx); while (obj != NULL) { ggml_print_object(obj); obj = obj->next; } - GGML_PRINT("%s: --- end ---\n", __func__); + GGML_LOG_INFO("%s: --- end ---\n", __func__); } int64_t ggml_nelements(const struct ggml_tensor * tensor) { @@ -3962,7 +3999,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { - GGML_PRINT("%s: 
not enough space in the context's memory pool (needed %zu, available %zu)\n", + GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; @@ -4026,7 +4063,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( if (ctx->scratch.data != NULL) { // allocate tensor data in the scratch buffer if (ctx->scratch.offs + data_size > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", __func__, ctx->scratch.offs + data_size, ctx->scratch.size); assert(false); return NULL; @@ -20013,7 +20050,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } #else if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max); + GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max); n_threads = threadpool->n_threads_max; } @@ -20552,30 +20589,30 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * } void ggml_graph_print(const struct ggml_cgraph * cgraph) { - GGML_PRINT("=== GRAPH ===\n"); + GGML_LOG_INFO("=== GRAPH ===\n"); - GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); + GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n", + GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " "); } - GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); + GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs); for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * node = cgraph->leafs[i]; - GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n", + GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n", i, node->ne[0], node->ne[1], ggml_op_name(node->op), ggml_get_name(node)); } - GGML_PRINT("========================================\n"); + GGML_LOG_INFO("========================================\n"); } // check if node is part of the graph @@ -20746,7 +20783,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fclose(fp); - GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); + GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); } //////////////////////////////////////////////////////////////////////////////// @@ -23241,4 +23278,9 @@ int ggml_cpu_get_sve_cnt(void) { return 0; #endif } + +void ggml_log_set(ggml_log_callback log_callback, void * user_data) { + g_logger_state.log_callback = log_callback ? 
log_callback : ggml_log_callback_default; + g_logger_state.log_callback_user_data = user_data; +} //////////////////////////////////////////////////////////////////////////////// diff --git a/src/llama.cpp b/src/llama.cpp index 69ba65395..3443b0689 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2266,17 +2266,12 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_ // globals // -struct llama_state { - llama_state() { - llama_log_set(log_callback, log_callback_user_data); - } - - // We save the log callback globally +struct llama_logger_state { ggml_log_callback log_callback = llama_log_callback_default; void * log_callback_user_data = nullptr; }; -static llama_state g_state; +static llama_logger_state g_logger_state; // available llama models enum e_model { @@ -21850,16 +21845,9 @@ const std::vector> & llama_internal } void llama_log_set(ggml_log_callback log_callback, void * user_data) { - g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; - g_state.log_callback_user_data = user_data; - - ggml_backend_set_log_callback(log_callback, user_data); - -#ifdef GGML_USE_METAL - ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); -#elif defined(GGML_USE_CANN) - ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); -#endif + ggml_log_set(log_callback, user_data); + g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default; + g_logger_state.log_callback_user_data = user_data; } static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) { @@ -21868,12 +21856,12 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l char buffer[128]; int len = vsnprintf(buffer, 128, format, args); if (len < 128) { - g_state.log_callback(level, buffer, g_state.log_callback_user_data); + g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data); } else { char * buffer2 = new char[len + 1]; vsnprintf(buffer2, len + 1, format, args_copy); buffer2[len] = 0; - g_state.log_callback(level, buffer2, g_state.log_callback_user_data); + g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data); delete[] buffer2; } va_end(args_copy); From a7ad553513a5d70b4ceacd36f64705cf3654dc97 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 3 Oct 2024 17:39:18 +0200 Subject: [PATCH 35/40] ggml-backend : add device description to CPU backend (#9720) --- ggml/src/ggml-backend.cpp | 88 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 3300ddb52..0551764fe 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1,5 +1,13 @@ // Note: porting this file to C++ is a work in progress +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + #include "ggml-backend-impl.h" #include "ggml-alloc.h" #include "ggml-impl.h" @@ -10,9 +18,15 @@ #include #include #include - +#include #include +#ifdef __APPLE__ +#include +#include +#endif + + // backend buffer type const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { @@ -1008,6 +1022,70 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) //////////////////////// +struct ggml_backend_cpu_device_context { + std::string description = "CPU"; + + ggml_backend_cpu_device_context() { +#ifdef __APPLE__ + size_t len = 
0; + if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) { + description.resize(len); + sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT + } +#elif defined(__linux__) + FILE * f = fopen("/proc/cpuinfo", "r"); + if (f) { + char buf[1024]; + while (fgets(buf, sizeof(buf), f)) { + if (strncmp(buf, "model name", 10) == 0) { + char * p = strchr(buf, ':'); + if (p) { + p++; + while (std::isspace(*p)) { + p++; + } + while (std::isspace(p[strlen(p) - 1])) { + p[strlen(p) - 1] = '\0'; + } + description = p; + break; + } + } + } + fclose(f); + } +#elif defined(_WIN32) + HKEY hKey; + if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, + TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), + 0, + KEY_READ, + &hKey) == ERROR_SUCCESS) { + DWORD cpu_brand_size = 0; + if (RegQueryValueExA(hKey, + TEXT("ProcessorNameString"), + NULL, + NULL, + NULL, + &cpu_brand_size) == ERROR_SUCCESS) { + description.resize(cpu_brand_size); + if (RegQueryValueExA(hKey, + TEXT("ProcessorNameString"), + NULL, + NULL, + (LPBYTE)&description[0], // NOLINT + &cpu_brand_size) == ERROR_SUCCESS) { + if (description.find('\0') != std::string::npos) { + description.resize(description.find('\0')); + } + } + } + RegCloseKey(hKey); + } +#endif + } +}; + static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) { return "CPU"; @@ -1015,10 +1093,9 @@ static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) { } static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) { - // TODO - return "CPU"; + struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context; - GGML_UNUSED(dev); + return ctx->description.c_str(); } static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { @@ -1131,10 +1208,11 @@ static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) { static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { GGML_ASSERT(index == 0); + static ggml_backend_cpu_device_context ctx; static ggml_backend_device ggml_backend_cpu_device = { /* .iface = */ ggml_backend_cpu_device_i, /* .reg = */ reg, - /* .context = */ NULL, + /* .context = */ &ctx, }; return &ggml_backend_cpu_device; From 5d5ab1e5cca0d6d63701a1cb85dbe26cb57d2c4e Mon Sep 17 00:00:00 2001 From: Jack Mousseau Date: Thu, 3 Oct 2024 11:01:46 -0700 Subject: [PATCH 36/40] metal : fix compute pass descriptor autorelease crash (#9718) --- ggml/src/ggml-metal.m | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 7ffaaf8d8..d10f5af0b 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -217,8 +217,6 @@ struct ggml_backend_metal_context { id device; id queue; - MTLComputePassDescriptor * edesc; - dispatch_queue_t d_queue; struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT]; @@ -304,8 +302,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(void) { struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context)); ctx->device = device; ctx->queue = [ctx->device newCommandQueue]; - ctx->edesc = MTLComputePassDescriptor.computePassDescriptor; - ctx->edesc.dispatchType = MTLDispatchTypeSerial; ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); id metal_library; @@ -3016,7 +3012,7 @@ static enum ggml_status ggml_metal_graph_compute( const int n_nodes_per_cb = ctx->n_nodes_per_cb; id 
command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoderWithDescriptor: ctx->edesc]; + id encoder = [command_buffer computeCommandEncoder]; int node_start = 0; int node_end = n_nodes_0; From eee39bdc96065b69242877fe8f1be05c885fc2aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 2 Oct 2024 15:32:39 +0200 Subject: [PATCH 37/40] ggml: refactor cross entropy loss CPU impl. (ggml/976) --- ggml/include/ggml-backend.h | 4 +-- ggml/src/ggml.c | 64 ++++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 864bcbded..4d7d2716e 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -247,7 +247,7 @@ extern "C" { GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i); @@ -262,7 +262,7 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); // Allocate and compute graph on the backend scheduler - GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index de500a675..6e034a087 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4232,9 +4232,13 @@ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, floa } struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { + if (ggml_is_empty(tensor)) { + return tensor; + } if (tensor->buffer) { ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor)); } else { + GGML_ASSERT(tensor->data); memset(tensor->data, 0, ggml_nbytes(tensor)); } return tensor; @@ -16851,41 +16855,40 @@ static void ggml_compute_forward_cross_entropy_loss_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); const int ith = params->ith; const int nth = params->nth; - float * sums = (float *) params->wdata; - - // TODO: handle transposed/permuted matrices - const int nc = src0->ne[0]; - const int 
nr = ggml_nrows(src0); + float * sums = (float *) params->wdata; + float * st = ((float *) params->wdata) + nth + ith*nc; + float sum_thread = 0.0f; GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); - if (ith == 0) { - memset(sums, 0, sizeof(float) * (nth + nth * nc)); - } - ggml_barrier(params->threadpool); - // rows per thread - const int dr = (nr + nth - 1)/nth; + const int64_t dr = (nr + nth - 1)/nth; // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); - for (int i1 = ir0; i1 < ir1; i1++) { - float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); - float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * st = ((float *) params->wdata) + nth + ith*nc; + for (int64_t i1 = ir0; i1 < ir1; ++i1) { + const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]); + const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]); #ifndef NDEBUG - for (int i = 0; i < nc; ++i) { + for (int64_t i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); assert(!isnan(s0[i])); assert(!isnan(s1[i])); @@ -16894,23 +16897,24 @@ static void ggml_compute_forward_cross_entropy_loss_f32( float max = -INFINITY; ggml_vec_max_f32(nc, &max, s0); - ggml_float sum = ggml_vec_log_soft_max_f32(nc, st, s0, max); - assert(sum >= 0.0); + const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max); + assert(sum_softmax >= 0.0); - ggml_vec_add1_f32(nc, st, st, -sum); + ggml_vec_add1_f32(nc, st, st, -sum_softmax); ggml_vec_mul_f32(nc, st, st, s1); - float st_sum = 0.0f; - ggml_vec_sum_f32(nc, &st_sum, st); - sums[ith] += st_sum; + float sum_st = 0.0f; + ggml_vec_sum_f32(nc, &sum_st, st); + sum_thread += sum_st; #ifndef NDEBUG - for (int i = 0; i < nc; ++i) { + for (int64_t i = 0; i < nc; ++i) { assert(!isnan(st[i])); assert(!isinf(st[i])); } #endif } + sums[ith] = sum_thread; ggml_barrier(params->threadpool); if (ith == 0) { @@ -16976,7 +16980,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); #ifndef NDEBUG - for (int i = 0; i < nc; ++i) { + for (int64_t i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); assert(!isnan(s0[i])); assert(!isnan(s1[i])); @@ -16995,7 +16999,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( ggml_vec_scale_f32(nc, ds0, d_by_nr); #ifndef NDEBUG - for (int i = 0; i < nc; ++i) { + for (int64_t i = 0; i < nc; ++i) { assert(!isnan(ds0[i])); assert(!isinf(ds0[i])); } From fabdc3bda396307565c4f3f4ecbc3a751a2eb6d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 3 Oct 2024 17:29:59 +0200 Subject: [PATCH 38/40] ggml/ex: calculate accuracy in graph, adapt MNIST (ggml/980) --- ggml/include/ggml.h | 7 ++ ggml/src/ggml-cuda.cu | 17 ++++ ggml/src/ggml-cuda/argmax.cu | 79 +++++++++++++++++ ggml/src/ggml-cuda/argmax.cuh | 3 + ggml/src/ggml-cuda/common.cuh | 12 +++ ggml/src/ggml-cuda/count-equal.cu | 64 ++++++++++++++ ggml/src/ggml-cuda/count-equal.cuh | 5 ++ ggml/src/ggml-cuda/fattn-tile-f16.cu | 2 +- ggml/src/ggml-cuda/fattn-vec-f16.cuh | 6 +- ggml/src/ggml.c | 123 ++++++++++++++++++++++++++- tests/test-backend-ops.cpp | 79 ++++++++++++++++- 11 files changed, 389 insertions(+), 8 deletions(-) create mode 100644 ggml/src/ggml-cuda/argmax.cu create mode 100644 ggml/src/ggml-cuda/argmax.cuh create mode 100644 ggml/src/ggml-cuda/count-equal.cu create mode 100644 
ggml/src/ggml-cuda/count-equal.cuh diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1b4006b62..e7678d071 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -456,6 +456,7 @@ extern "C" { GGML_OP_SUM_ROWS, GGML_OP_MEAN, GGML_OP_ARGMAX, + GGML_OP_COUNT_EQUAL, GGML_OP_REPEAT, GGML_OP_REPEAT_BACK, GGML_OP_CONCAT, @@ -994,6 +995,12 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // count number of equal elements in a and b + GGML_API struct ggml_tensor * ggml_count_equal( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // if a is the same shape as b, and a is not parameter, return a // otherwise, return a new tensor: repeat(a) to fit in b GGML_API struct ggml_tensor * ggml_repeat( diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 663e97cd7..bcb39766b 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -5,12 +5,14 @@ #include "ggml-cuda/common.cuh" #include "ggml-cuda/acc.cuh" #include "ggml-cuda/arange.cuh" +#include "ggml-cuda/argmax.cuh" #include "ggml-cuda/argsort.cuh" #include "ggml-cuda/binbcast.cuh" #include "ggml-cuda/clamp.cuh" #include "ggml-cuda/concat.cuh" #include "ggml-cuda/conv-transpose-1d.cuh" #include "ggml-cuda/convert.cuh" +#include "ggml-cuda/count-equal.cuh" #include "ggml-cuda/cpy.cuh" #include "ggml-cuda/cross-entropy-loss.cuh" #include "ggml-cuda/diagmask.cuh" @@ -2143,6 +2145,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg } switch (dst->op) { + case GGML_OP_ARGMAX: + ggml_cuda_argmax(ctx, dst); + break; + case GGML_OP_COUNT_EQUAL: + ggml_cuda_count_equal(ctx, dst); + break; case GGML_OP_REPEAT: ggml_cuda_op_repeat(ctx, dst); break; @@ -3073,6 +3081,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; } break; case GGML_OP_DUP: + { + ggml_type src0_type = op->src[0]->type; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + } break; + case GGML_OP_ARGMAX: + case GGML_OP_COUNT_EQUAL: + { + return true; + } break; case GGML_OP_REPEAT: { ggml_type src0_type = op->src[0]->type; diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu new file mode 100644 index 000000000..aab04eca7 --- /dev/null +++ b/ggml/src/ggml-cuda/argmax.cu @@ -0,0 +1,79 @@ +#include "common.cuh" +#include "argmax.cuh" +#include "sum.cuh" + +#include + +static __global__ void argmax_f32( + const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) { + + int argmax_thread = 0; + const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE; + +#pragma unroll + for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) { + const int64_t row = row0 + row1; + + if (row >= nrows) { + break; + } + + float maxval = -FLT_MAX; + int argmax = -1; + + for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) { + const float val = x[row*ncols + col]; + const int bigger = val > maxval; + const int not_bigger = bigger ^ 0x00000001; + + maxval = maxval*not_bigger + val*bigger; + argmax = argmax*not_bigger + col*bigger; + } + +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE); + const int bigger = val > maxval; + const int not_bigger = bigger ^ 0x00000001; + + maxval = maxval*not_bigger + val*bigger; + argmax = argmax*not_bigger + col*bigger; + } + + const int store = row1 == threadIdx.x; + argmax_thread += store*argmax; + } + + const int 
row = row0 + threadIdx.x; + + if (row >= nrows) { + return; + } + + dst[row] = argmax_thread; +} + +void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + const float * src0_d = (const float *) src0->data; + int32_t * dst_d = (int32_t *) dst->data; + + cudaStream_t stream = ctx.stream(); + + const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE; + + const dim3 blocks_dim(WARP_SIZE, 1, 1); + const dim3 blocks_num(num_blocks, 1, 1); + + argmax_f32<<>>(src0_d, dst_d, ne00, nrows); +} diff --git a/ggml/src/ggml-cuda/argmax.cuh b/ggml/src/ggml-cuda/argmax.cuh new file mode 100644 index 000000000..5b7223adc --- /dev/null +++ b/ggml/src/ggml-cuda/argmax.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 6a4bcdba0..dd203fcde 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -175,6 +175,18 @@ static __device__ void no_device_code( #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.") #endif // __CUDA_ARCH__ +static __device__ __forceinline__ int warp_reduce_sum(int x) { +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE + return __reduce_add_sync(0xffffffff, x); +#else +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, mask, 32); + } + return x; +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE +} + static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { diff --git a/ggml/src/ggml-cuda/count-equal.cu b/ggml/src/ggml-cuda/count-equal.cu new file mode 100644 index 000000000..ffb053b10 --- /dev/null +++ b/ggml/src/ggml-cuda/count-equal.cu @@ -0,0 +1,64 @@ +#include "common.cuh" +#include "count-equal.cuh" + +#include + +template +static __global__ void count_equal(const T * __restrict__ x, const T * __restrict__ y, int64_t * __restrict__ dst, const int64_t dk, const int64_t k) { + const int64_t i0 = (int64_t) blockIdx.x*dk; + const int64_t i1 = min(i0 + dk, k); + + int nequal = 0; + + for (int64_t i = i0 + threadIdx.x; i < i1; i += WARP_SIZE) { + const T xi = x[i]; + const T yi = y[i]; + nequal += xi == yi; + } + + nequal = warp_reduce_sum(nequal); + + if (threadIdx.x != 0) { + return; + } + + atomicAdd((int *) dst, nequal); +} + +void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == src1->type); + GGML_ASSERT( dst->type == GGML_TYPE_I64); + + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + int64_t * dst_d = (int64_t *) dst->data; + + cudaStream_t stream = ctx.stream(); + const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; + + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int"); + const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE); + + 
CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream)); + + const dim3 blocks_dim(WARP_SIZE, 1, 1); + const dim3 blocks_num(std::min((int64_t)4*nsm, (ne + CUDA_COUNT_EQUAL_CHUNK_SIZE - 1)/CUDA_COUNT_EQUAL_CHUNK_SIZE), 1, 1); + + switch (src0->type) { + case GGML_TYPE_I32: { + const int * src0_d = (const int *) src0->data; + const int * src1_d = (const int *) src1->data; + count_equal<<>>(src0_d, src1_d, dst_d, dne, ne); + } break; + default: + GGML_ASSERT(false); + break; + } +} diff --git a/ggml/src/ggml-cuda/count-equal.cuh b/ggml/src/ggml-cuda/count-equal.cuh new file mode 100644 index 000000000..8467da79e --- /dev/null +++ b/ggml/src/ggml-cuda/count-equal.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128 + +void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index 342f2eb66..5af02c7ec 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -259,7 +259,7 @@ static __global__ void flash_attn_tile_ext_f16( } half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]); - kqsum_j = warp_reduce_sum(kqsum_j); + kqsum_j = warp_reduce_sum((float)kqsum_j); #pragma unroll for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) { diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index 448a9a905..2ed6509ac 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -196,7 +196,7 @@ static __global__ void flash_attn_vec_ext_f16( #pragma unroll for (int j = 0; j < ncols; ++j) { half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]); - sum = warp_reduce_sum(sum); + sum = warp_reduce_sum((float)sum); if (use_logit_softcap) { sum = logit_softcap*tanhf(sum); @@ -265,7 +265,7 @@ static __global__ void flash_attn_vec_ext_f16( #pragma unroll for (int j = 0; j < ncols; ++j) { - kqsum[j] = warp_reduce_sum(kqsum[j]); + kqsum[j] = warp_reduce_sum((float)kqsum[j]); if (threadIdx.x == 0) { kqsum_shared[j][threadIdx.y] = kqsum[j]; } @@ -280,7 +280,7 @@ static __global__ void flash_attn_vec_ext_f16( } kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; - kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); + kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]); half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ])); if (parallel_blocks == 1) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6e034a087..03b832d0f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2994,6 +2994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "SUM_ROWS", "MEAN", "ARGMAX", + "COUNT_EQUAL", "REPEAT", "REPEAT_BACK", "CONCAT", @@ -3067,7 +3068,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80"); +static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3088,6 +3089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "Σx_k", "Σx/n", "argmax(x)", + "count_equal(x)", "repeat(x)", "repeat_back(x)", "concat(x, y)", @@ -3161,7 +3163,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80"); +static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -5222,6 +5224,23 @@ struct ggml_tensor * ggml_argmax( return result; } 
+// ggml_count_equal + +struct ggml_tensor * ggml_count_equal( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 1); + + result->op = GGML_OP_COUNT_EQUAL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // ggml_repeat struct ggml_tensor * ggml_repeat( @@ -10809,6 +10828,86 @@ static void ggml_compute_forward_argmax( } } +// ggml_compute_forward_count_equal + +static void ggml_compute_forward_count_equal_i32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(src0->type == GGML_TYPE_I32); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(dst->type == GGML_TYPE_I64); + + const int64_t nr = ggml_nrows(src0); + + const int ith = params->ith; + const int nth = params->nth; + + int64_t * sums = (int64_t *) params->wdata; + int64_t sum_thread = 0; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir / (ne02*ne01); + const int64_t i02 = (ir - i03*ne03) / ne01; + const int64_t i01 = ir - i03*ne03 - i02*ne02; + + const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01; + const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11; + + for (int64_t i00 = 0; i00 < ne00; ++i00) { + const int32_t val0 = *((const int32_t *) (data0 + i00*nb00)); + const int32_t val1 = *((const int32_t *) (data1 + i00*nb10)); + + sum_thread += val0 == val1; + } + } + if (ith != 0) { + sums[ith] = sum_thread; + } + ggml_barrier(params->threadpool); + + if (ith != 0) { + return; + } + + for (int ith_other = 1; ith_other < nth; ++ith_other) { + sum_thread += sums[ith_other]; + } + *((int64_t *) dst->data) = sum_thread; +} + +static void ggml_compute_forward_count_equal( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_I32: + { + ggml_compute_forward_count_equal_i32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_repeat static void ggml_compute_forward_repeat_f32( @@ -17187,6 +17286,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_argmax(params, tensor); } break; + case GGML_OP_COUNT_EQUAL: + { + ggml_compute_forward_count_equal(params, tensor); + } break; case GGML_OP_REPEAT: { ggml_compute_forward_repeat(params, tensor); @@ -17937,6 +18040,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_MEAN: case GGML_OP_ARGMAX: + case GGML_OP_COUNT_EQUAL: { GGML_ABORT("fatal error"); // TODO: implement } @@ -18710,6 +18814,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * for (int i = 0; i < gf->n_nodes; ++i) { struct ggml_tensor * node = gf->nodes[i]; + if (node->type == GGML_TYPE_I32) { + continue; + } + bool needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM; bool ignore_src[GGML_MAX_SRC] = {false}; switch (node->op) { @@ -19113,6 +19221,13 @@ static int 
ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: case GGML_OP_ARGMAX: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT_EQUAL: + { + n_tasks = n_threads; + } break; case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: case GGML_OP_LEAKY_RELU: @@ -19611,6 +19726,10 @@ struct ggml_cplan ggml_graph_plan( cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; } } break; + case GGML_OP_COUNT_EQUAL: + { + cur = ggml_type_size(node->type)*n_tasks; + } break; case GGML_OP_MUL_MAT: { const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 86a0b379b..a10d98e35 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -116,6 +116,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + } else if (tensor->type == GGML_TYPE_I64) { + // Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful. + const size_t nbytes_half = ggml_nbytes(tensor)/2; + ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half); + ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half); } else { GGML_ABORT("fatal error"); } @@ -145,6 +150,8 @@ static std::vector tensor_to_float(const ggml_tensor * t) { tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i])); } else if (t->type == GGML_TYPE_F32) { tv.push_back(*(float *) &buf[i]); + } else if (t->type == GGML_TYPE_I64) { + tv.push_back((float)*(int64_t *) &buf[i]); } else if (t->type == GGML_TYPE_I32) { tv.push_back((float)*(int32_t *) &buf[i]); } else if (t->type == GGML_TYPE_I16) { @@ -1116,6 +1123,71 @@ struct test_get_rows : public test_case { } }; +// GGML_OP_ARGMAX +struct test_argmax : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_argmax(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 100, 1, 1}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_argmax(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + double max_nmse_err() override { + return 0.0; + } +}; + +// GGML_OP_COUNT_EQUAL +struct test_count_equal : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_count_equal(ggml_type type = GGML_TYPE_F32, + std::array ne = {4, 500, 1, 1}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * a_argmax = ggml_argmax(ctx, a); + ggml_set_name(a_argmax, "a_argmax"); + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + + ggml_tensor * b_argmax = ggml_argmax(ctx, a); + ggml_set_name(b_argmax, "b_argmax"); + + ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax); + ggml_set_name(out, "out"); + + return out; + } + + double max_nmse_err() override { + return 0.0; + } +}; + // GGML_OP_REPEAT struct test_repeat 
: public test_case { const ggml_type type; @@ -3260,6 +3332,9 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1)); test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); + test_cases.emplace_back(new test_argmax()); + test_cases.emplace_back(new test_count_equal()); + for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1 test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1})); test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1})); @@ -3278,8 +3353,8 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3})); test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous - test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3})); + test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3})); for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) { test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim)); From 1bb8a64ebfcbe599dacb4fc8069731b6cba0b5d6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 3 Oct 2024 21:17:49 +0300 Subject: [PATCH 39/40] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 23c24899e..3d79c9ac9 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -4de6ee8e6a4b2145d6b92162bc87722fecb4ea46 +e5c233e5edbfcfa1d808b9293de9065035c40751 From d5ed2b929d85bbd7dbeecb690880f07d9d7a6077 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 3 Oct 2024 21:18:19 +0300 Subject: [PATCH 40/40] metal : remove abort (skip) (ggml/0) --- ggml/src/ggml-metal.m | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index d10f5af0b..c6a7014fc 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -2993,7 +2993,6 @@ static enum ggml_status ggml_metal_graph_compute( NSError * error = nil; if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); - GGML_ABORT("capture failed"); } else { [ctx->capture_scope beginScope]; ctx->capture_started = true;
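
A couple of usage notes on the APIs touched in the patches above. Both snippets are illustrative sketches written against this revision of the tree; the file names, tensor names, sizes and fill values are invented for the example and do not come from the patches themselves.

First, after the logging unification in llama.cpp, a single llama_log_set() call installs one callback for both llama.cpp and ggml messages (the function now forwards to ggml_log_set() internally), so the old per-backend setters such as the Metal/CANN variants removed above are no longer needed. A minimal sketch, with a made-up callback name:

    // log_example.c -- minimal sketch, "my_log" is a hypothetical callback
    #include "llama.h"
    #include <stdio.h>

    static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        // llama.cpp log messages generally carry their own trailing newline
        fprintf(stderr, "[%d] %s", (int) level, text);
    }

    int main(void) {
        llama_log_set(my_log, NULL); // covers llama.cpp and, via ggml_log_set(), the ggml backends
        // ... load a model, run inference, etc. ...
        return 0;
    }

Second, the new GGML_OP_COUNT_EQUAL from PATCH 38/40 pairs naturally with ggml_argmax() to compute classification accuracy inside a graph, which is what the adapted MNIST example uses it for. A minimal CPU-only sketch with synthetic data and made-up sizes:

    // accuracy_example.c -- minimal sketch of ggml_argmax + ggml_count_equal
    #include "ggml.h"
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int n_classes  = 10;
        const int n_examples = 8;

        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        // logits: n_classes scores per example; labels: one int32 class id per example
        struct ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_classes, n_examples);
        struct ggml_tensor * labels = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_examples);

        float   * x = (float   *) logits->data;
        int32_t * y = (int32_t *) labels->data;
        for (int i = 0; i < n_examples; ++i) {
            for (int j = 0; j < n_classes; ++j) {
                x[i*n_classes + j] = (j == i % n_classes) ? 1.0f : 0.0f; // peak at class i % n_classes
            }
            y[i] = i % n_classes; // labels match the peaks, so the expected accuracy is 1.0
        }

        // argmax per row (I32 result), then count matches against the labels (1-element I64 tensor)
        struct ggml_tensor * pred      = ggml_argmax     (ctx, logits);
        struct ggml_tensor * n_correct = ggml_count_equal(ctx, pred, labels);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, n_correct);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);

        const double acc = (double) *(int64_t *) n_correct->data / n_examples;
        printf("accuracy = %.3f\n", acc);

        ggml_free(ctx);
        return 0;
    }

The CUDA path added in the same patch implements both ops on the GPU as well; note that its count_equal kernel accumulates with an int atomicAdd and asserts that the inputs have fewer than 2^30 elements.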