diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..b7e8b8ff2 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "kompute"] + path = kompute + url = https://github.com/nomic-ai/kompute.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 668669c6d..01f01bfee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,7 @@ option(LLAMA_CLBLAST "llama: use CLBlast" option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) +option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) @@ -442,6 +443,161 @@ if (LLAMA_HIPBLAS) endif() endif() +if (LLAMA_KOMPUTE) + add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) + find_package(Vulkan COMPONENTS glslc REQUIRED) + find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) + if (NOT glslc_executable) + message(FATAL_ERROR "glslc not found") + endif() + + function(compile_shader) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES) + cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + foreach(source ${compile_shader_SOURCES}) + get_filename_component(filename ${source} NAME) + set(spv_file ${filename}.spv) + add_custom_command( + OUTPUT ${spv_file} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp + COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} + COMMENT "Compiling ${source} to ${spv_file}" + ) + + get_filename_component(RAW_FILE_NAME ${spv_file} NAME) + set(FILE_NAME "shader${RAW_FILE_NAME}") + string(REPLACE ".comp.spv" ".h" 
HEADER_FILE ${FILE_NAME}) + string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) + string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") + set(OUTPUT_HEADER_FILE "${HEADER_FILE}") + message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") + if(CMAKE_GENERATOR MATCHES "Visual Studio") + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd" + ) + else() + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS 
${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" + ) + endif() + endforeach() + endfunction() + + if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") + message(STATUS "Kompute found") + set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") + add_subdirectory(kompute) + + # Compile our shaders + compile_shader(SOURCES + kompute-shaders/op_scale.comp + kompute-shaders/op_scale_8.comp + kompute-shaders/op_add.comp + kompute-shaders/op_addrow.comp + kompute-shaders/op_mul.comp + kompute-shaders/op_mulrow.comp + kompute-shaders/op_silu.comp + kompute-shaders/op_relu.comp + kompute-shaders/op_gelu.comp + kompute-shaders/op_softmax.comp + kompute-shaders/op_norm.comp + kompute-shaders/op_rmsnorm.comp + kompute-shaders/op_diagmask.comp + kompute-shaders/op_mul_mat_mat_f32.comp + kompute-shaders/op_mul_mat_f16.comp + kompute-shaders/op_mul_mat_q8_0.comp + kompute-shaders/op_mul_mat_q4_0.comp + kompute-shaders/op_mul_mat_q4_1.comp + kompute-shaders/op_mul_mat_q6_k.comp + kompute-shaders/op_getrows_f16.comp + kompute-shaders/op_getrows_q4_0.comp + kompute-shaders/op_getrows_q4_1.comp + kompute-shaders/op_getrows_q6_k.comp + kompute-shaders/op_rope_f16.comp + kompute-shaders/op_rope_f32.comp + kompute-shaders/op_cpy_f16_f16.comp + kompute-shaders/op_cpy_f16_f32.comp + kompute-shaders/op_cpy_f32_f16.comp + kompute-shaders/op_cpy_f32_f32.comp + ) + + # Create a custom target for our generated shaders + add_custom_target(generated_shaders DEPENDS + shaderop_scale.h + shaderop_scale_8.h + shaderop_add.h + shaderop_addrow.h + shaderop_mul.h + shaderop_mulrow.h + shaderop_silu.h + shaderop_relu.h + shaderop_gelu.h + shaderop_softmax.h + shaderop_norm.h + shaderop_rmsnorm.h + shaderop_diagmask.h + shaderop_mul_mat_mat_f32.h + shaderop_mul_mat_f16.h + shaderop_mul_mat_q8_0.h + shaderop_mul_mat_q4_0.h + shaderop_mul_mat_q4_1.h + shaderop_mul_mat_q6_k.h + shaderop_getrows_f16.h + shaderop_getrows_q4_0.h + 
shaderop_getrows_q4_1.h + shaderop_getrows_q6_k.h + shaderop_rope_f16.h + shaderop_rope_f32.h + shaderop_cpy_f16_f16.h + shaderop_cpy_f16_f32.h + shaderop_cpy_f32_f16.h + shaderop_cpy_f32_f32.h + ) + + # Create a custom command that depends on the generated_shaders + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + DEPENDS generated_shaders + COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" + ) + + # Add the stamp to the main sources to ensure dependency tracking + set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + add_compile_definitions(GGML_USE_KOMPUTE) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR}) + else() + message(WARNING "Kompute not found") + endif() +endif() + function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") @@ -758,11 +914,12 @@ add_library(ggml OBJECT ggml-backend.h ggml-quants.c ggml-quants.h - ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} - ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} - ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} - ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} + ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} + ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} + ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} ) target_include_directories(ggml PUBLIC . 
${LLAMA_EXTRA_INCLUDES}) diff --git a/common/common.cpp b/common/common.cpp index 4e89fe516..4a6241fb5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -543,9 +543,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -554,9 +553,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers_draft = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -565,25 +563,44 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); -#endif +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. 
Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_LAYER; + } else if (arg_next == "row") { + params.split_mode = LLAMA_SPLIT_ROW; + } else { + invalid_param = true; + break; + } +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS std::string arg_next = argv[i]; // split string by , and / const std::regex regex{R"([,/]+)"}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector<std::string> split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - + if (split_arg.size() >= LLAMA_MAX_DEVICES) { + invalid_param = true; + break; + } for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); @@ -591,14 +608,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.tensor_split[i] = 0.0f; } } -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); -#endif // GGML_USE_CUBLAS - } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { -#ifdef GGML_USE_CUBLAS - params.mul_mat_q = false; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. 
Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; @@ -909,14 +920,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" number of layers to store in VRAM\n"); printf(" -ngld N, --n-gpu-layers-draft N\n"); printf(" number of layers to store in VRAM for the draft model\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); -#ifdef GGML_USE_CUBLAS - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); -#endif // GGML_USE_CUBLAS + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); #endif printf(" -gan N, --grp-attn-n N\n"); printf(" group-attention factor (default: %d)\n", params.grp_attn_n); @@ -1033,6 +1045,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.n_gpu_layers = params.n_gpu_layers; } mparams.main_gpu = params.main_gpu; + mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; diff --git a/common/common.h b/common/common.h index e2bbfc258..5152b36d3 100644 --- a/common/common.h +++ b/common/common.h @@ -59,6 +59,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. 
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 57596ed98..7924db267 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -88,7 +88,10 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); + const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f); + model_params.n_gpu_layers = n_gpu_layers; + model_params.tensor_split = t_split.data(); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 5ea67051f..19bad56cd 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -31,6 +31,10 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#if defined(GGML_USE_KOMPUTE) +#include "ggml-kompute.h" +#endif + static llama_context ** g_ctx; static llama_model ** g_model; static gpt_params * g_params; @@ -182,6 +186,10 @@ int main(int argc, char ** argv) { g_model = &model; g_ctx = &ctx; +#if defined(GGML_USE_KOMPUTE) + ggml_vk_init_device(0, "gpu"); +#endif + // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); diff --git a/ggml-alloc.c b/ggml-alloc.c index a27dd54b0..7836f064e 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -229,6 +229,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) { alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows } else { alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + ggml_backend_buffer_reset(alloc->buffer); } } @@ -779,10 +780,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (nbytes == 0) { // all the tensors in the context are already allocated +#ifndef NDEBUG + fprintf(stderr, "%s: all tensors in 
the context are already allocated\n", __func__); +#endif return NULL; } ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes); + if (buffer == NULL) { + // failed to allocate buffer +#ifndef NDEBUG + fprintf(stderr, "%s: failed to allocate buffer\n", __func__); +#endif + return NULL; + } + ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer); for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index ca21b4743..859e923e2 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -16,9 +16,10 @@ extern "C" { typedef void * ggml_backend_buffer_type_context_t; struct ggml_backend_buffer_type_i { + const char * (*get_name) (ggml_backend_buffer_type_t buft); ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend // check if tensor data is in host memory // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) @@ -34,16 +35,17 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - void (*free_buffer) (ggml_backend_buffer_t buffer); - //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras - void * (*get_base) (ggml_backend_buffer_t buffer); - void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*set_tensor) 
(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + const char * (*get_name) (ggml_backend_buffer_t buffer); + void (*free_buffer) (ggml_backend_buffer_t buffer); + void * (*get_base) (ggml_backend_buffer_t buffer); + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) copy tensor between different buffer-type, allow for single-copy tranfers - void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras }; struct ggml_backend_buffer { @@ -51,6 +53,7 @@ extern "C" { ggml_backend_buffer_type_t buft; ggml_backend_buffer_context_t context; size_t size; + enum ggml_backend_buffer_usage usage; }; ggml_backend_buffer_t ggml_backend_buffer_init( @@ -79,13 +82,13 @@ extern "C" { void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) asynchroneous tensor copy - void 
(*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_from_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to_async) (ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); void (*synchronize)(ggml_backend_t backend); // compute graph with a plan - ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); diff --git a/ggml-backend.c b/ggml-backend.c index 53e741cb8..7a0a426e8 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -15,6 +15,10 @@ // backend buffer type +const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name(buft); +} + ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init( /* .buft = */ buft, /* .context = */ context, /* .size = */ size, + /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY }; return buffer; } +const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name(buffer); +} + void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer == NULL) { return; @@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t } size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) { - return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer)); + return 
ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); } size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor); + return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); } void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -106,13 +115,23 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { } bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer)); + return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer)); } -ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) { +void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { + buffer->usage = usage; +} + +ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { return buffer->buft; } +void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { + if (buffer->iface.reset) { + buffer->iface.reset(buffer); + } +} + // backend const char * ggml_backend_name(ggml_backend_t backend) { @@ -295,6 +314,12 @@ static void ggml_backend_registry_init(void) { extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); #endif + +#ifdef GGML_USE_KOMPUTE + extern ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data); + extern ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void); + ggml_backend_register("Kompute", ggml_backend_reg_kompute_init, ggml_backend_kompute_buffer_type(), NULL); +#endif } void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { @@ -392,6 
+417,12 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { // backend CPU +static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { + return "CPU"; + + GGML_UNUSED(buffer); +} + static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *)buffer->context; } @@ -412,13 +443,13 @@ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, con GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); GGML_UNUSED(buffer); @@ -429,6 +460,7 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static struct ggml_backend_buffer_i cpu_backend_buffer_i = { + /* .get_name = */ ggml_backend_cpu_buffer_name, /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -437,10 +469,12 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; // for buffers from ptr, free is not called static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { + /* .get_name = */ 
ggml_backend_cpu_buffer_name, /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -449,10 +483,17 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 +static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU"; + + GGML_UNUSED(buft); +} + static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? 
@@ -483,6 +524,7 @@ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -501,6 +543,18 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { #include +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_HBM"; + + GGML_UNUSED(buft); +} + +static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { + return "CPU_HBM"; + + GGML_UNUSED(buf); +} + static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } @@ -514,17 +568,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_ return NULL; } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name; buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; return buffer; } -ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -568,7 +623,7 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; 
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); @@ -661,7 +716,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cpu_name; + return backend && backend->iface.get_name == ggml_backend_cpu_name; } void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { @@ -685,7 +740,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user // scheduler -#define GGML_MAX_BACKENDS 4 +#define GGML_MAX_BACKENDS 16 #define GGML_MAX_SPLITS 256 #define GGML_MAX_SPLIT_INPUTS 16 @@ -695,9 +750,16 @@ struct ggml_backend_sched_split { int i_end; struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS]; int n_inputs; + // graph view of this split struct ggml_cgraph graph; }; +// TODO: group all the hash values into a single struct for clarity +//struct sched_hash_value { +// ggml_tallocr_t tallocr; +// struct ggml_tensor * copies[GGML_MAX_BACKENDS]; +//}; + struct ggml_backend_sched { int n_backends; ggml_backend_t backends[GGML_MAX_BACKENDS]; @@ -705,11 +767,15 @@ struct ggml_backend_sched { ggml_gallocr_t galloc; + // hash keys of the nodes in the graph struct ggml_hash_set hash_set; - ggml_tallocr_t * node_talloc; // [hash_set.size] - struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS] + // hash values (arrays of [hash_set.size]) + ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend) + struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each 
destination backend + // copy of the graph with modified inputs struct ggml_cgraph * graph; + struct ggml_backend_sched_split splits[GGML_MAX_SPLITS]; int n_splits; @@ -777,7 +843,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc } #if 0 -static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove +static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) #define GET_CAUSE(node) causes[hash_id(node)] #else @@ -790,6 +856,7 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there // ie. kv cache updates // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend. + // dst ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer); if (cur_backend != NULL) { @@ -804,7 +871,6 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct } // src - int cur_prio = INT_MAX; size_t cur_size = 0; for (int i = 0; i < GGML_MAX_SRC; i++) { @@ -812,16 +878,20 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct if (src == NULL) { break; } + ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer); - if (src_backend != NULL) { - int src_prio = sched_backend_prio(sched, src_backend); - size_t src_size = ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - cur_backend = src_backend; - SET_CAUSE(node, "1.src%d", i); - } + if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + // operations with weights are always on the same backend as the weights + cur_backend = src_backend; + SET_CAUSE(node, 
"1.wgt%d", i); + break; + } + + size_t src_size = ggml_nbytes(src); + if (src_size >= cur_size) { + cur_size = src_size; + cur_backend = src_backend; + SET_CAUSE(node, "1.src%d", i); } } return cur_backend; @@ -857,7 +927,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } ggml_tallocr_t node_allocr = node_allocr(node); ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: - fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name, + fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -866,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } ggml_tallocr_t src_allocr = node_allocr(src); ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; - fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, + fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? 
ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } fprintf(stderr, "\n"); @@ -882,14 +952,16 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co return dup; } + +//#define DEBUG_PASS1 +//#define DEBUG_PASS2 +//#define DEBUG_PASS3 +//#define DEBUG_PASS4 + // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend // TODO: merge passes static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - // reset state - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); - memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); - memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + // reset splits sched->n_splits = 0; struct ggml_init_params params = { @@ -898,11 +970,13 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g /* .no_alloc = */ true }; - if (sched->ctx != NULL) { - ggml_free(sched->ctx); - } + ggml_free(sched->ctx); sched->ctx = ggml_init(params); + if (sched->ctx == NULL) { + fprintf(stderr, "%s: failed to initialize context\n", __func__); + GGML_ASSERT(false); + } // pass 1: assign backends to ops with allocated inputs for (int i = 0; i < graph->n_leafs; i++) { @@ -931,45 +1005,91 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend); } } - //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS1 + fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif // pass 2: assign backends to ops from current assignments - // TODO: - // - reuse sched_backend_from_cur - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr == NULL) { - int cur_prio = 
INT_MAX; - size_t cur_size = 0; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - break; - } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != NULL) { - int src_prio = sched_allocr_prio(sched, src_allocr); - size_t src_size = ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - node_allocr = src_allocr; - SET_CAUSE(node, "2.src%d", j); - } - } + // start from the end and assign the same backend to previous ops + + // expand gpu backends (i.e. non last prio) up and down, ignoring cpu + // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops + + // pass 2.1 expand gpu up + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; } + ggml_tallocr_t node_allocr = node_allocr(node); if (node_allocr != NULL) { - node_allocr(node) = node_allocr; + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; + } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.cur"); } } } - //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); - // pass 3: assign backends to remaining src from dst (should only be leafs) + // pass 2.2 expand gpu down + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; + } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.cur"); + } + } + } + + // pass 2.3 expand rest up 
+ { + ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + cur_allocr = node_allocr; + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.cur"); + } + } + } +#ifdef DEBUG_PASS2 + fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif + + // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); + ggml_tallocr_t cur_allocr = node_allocr(node); + if (ggml_is_view_op(node->op) && cur_allocr == NULL) { + cur_allocr = node_allocr(node) = node_allocr(node->view_src); + SET_CAUSE(node, "3.vsrc"); + } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -977,81 +1097,100 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g } ggml_tallocr_t src_allocr = node_allocr(src); if (src_allocr == NULL) { - node_allocr(src) = node_allocr; + if (src->view_src != NULL) { + // views are always on the same backend as the source + node_allocr(src) = node_allocr(src->view_src); + } else { + node_allocr(src) = cur_allocr; + } } } } - //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS3 + fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif // pass 4: split graph, find tensors that need to be copied - // TODO: - // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost - // find first backend - int cur_split = 0; - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - if (node->view_src == NULL) { - 
sched->splits[0].tallocr = node_allocr(node); - break; - } - } - sched->splits[0].i_start = 0; - sched->splits[0].n_inputs = 0; - memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; - size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - - if (ggml_is_view_op(node->op)) { - continue; - } - - ggml_tallocr_t node_allocr = node_allocr(node); - - if (node_allocr != cur_allocr) { - sched->splits[cur_split].i_end = i; - cur_split++; - GGML_ASSERT(cur_split < GGML_MAX_SPLITS); - sched->splits[cur_split].tallocr = node_allocr; - sched->splits[cur_split].i_start = i; - sched->splits[cur_split].n_inputs = 0; - memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK - cur_allocr = node_allocr; - cur_backend_id = sched_allocr_prio(sched, cur_allocr); - } - - // find inputs that are not on the same backend - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + { + int cur_split = 0; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (node->view_src == NULL) { + sched->splits[0].tallocr = node_allocr(node); break; } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != node_allocr) { - int n_inputs = sched->splits[cur_split].n_inputs++; - GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src; + } + sched->splits[0].i_start = 0; + sched->splits[0].n_inputs = 0; + memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK + ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; + size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; - // create copies - size_t id = 
hash_id(src); - if (sched->node_copies[id][cur_backend_id] == NULL) { - struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); - sched->node_copies[id][cur_backend_id] = tensor_copy; - node_allocr(tensor_copy) = cur_allocr; - ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); - ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); + if (ggml_is_view_op(node->op)) { + continue; + } + + ggml_tallocr_t node_allocr = node_allocr(node); + + if (node_allocr != cur_allocr) { + sched->splits[cur_split].i_end = i; + cur_split++; + GGML_ASSERT(cur_split < GGML_MAX_SPLITS); + sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].i_start = i; + sched->splits[cur_split].n_inputs = 0; + memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK + cur_allocr = node_allocr; + cur_backend_id = sched_allocr_prio(sched, cur_allocr); + } + + // find inputs that are not on the same backend + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr != node_allocr) { + // check if the input is already in the split + bool found = false; + for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { + if (sched->splits[cur_split].inputs[k] == src) { + found = true; + break; + } + } + + if (!found) { + int n_inputs = sched->splits[cur_split].n_inputs++; + //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr))); + GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); + sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src; + } + + // create a copy of the input in the split's backend + size_t id = hash_id(src); + if (sched->node_copies[id][cur_backend_id] == NULL) { + struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); + 
sched->node_copies[id][cur_backend_id] = tensor_copy; + node_allocr(tensor_copy) = cur_allocr; + ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); + } + node->src[j] = sched->node_copies[id][cur_backend_id]; } - node->src[j] = sched->node_copies[id][cur_backend_id]; } } + sched->splits[cur_split].i_end = graph->n_nodes; + sched->n_splits = cur_split + 1; } - sched->splits[cur_split].i_end = graph->n_nodes; - sched->n_splits = cur_split + 1; +#ifdef DEBUG_PASS4 + fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif - //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout); - -#if 1 +#ifndef NDEBUG // sanity check: all sources should have the same backend as the node for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1059,6 +1198,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g if (node_allocr == NULL) { fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); } + if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) { + fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", + node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", + node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL"); + } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -1070,8 +1214,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", j, src->name, src_allocr ? 
ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); } + if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) { + fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", + src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL", + src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL"); + } } } + fflush(stderr); #endif // create copies of the graph for each split @@ -1085,6 +1235,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + // add a dependency to the input source so that it is not freed before the copy is done input_cpy->src[0] = input; graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } @@ -1121,19 +1272,20 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)]; if (input->buffer == NULL) { + GGML_ASSERT(false); if (input->view_src == NULL) { fprintf(stderr, "input %s has no buffer and no view_src\n", input->name); - exit(1); + GGML_ASSERT(false); } // FIXME: may need to use the sched buffer instead ggml_backend_view_init(input->view_src->buffer, input); } if (input_cpy->buffer == NULL) { fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name); - exit(1); + GGML_ASSERT(false); } - //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend); - //GGML_ASSERT(input_cpy->buffer->backend == split_backend); + // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change + // this is important to avoid copying constants such as KQ_mask and 
inp_pos multiple times ggml_backend_tensor_copy(input, input_cpy); } // ggml_backend_synchronize(split_backend); @@ -1168,13 +1320,23 @@ static void sched_reset(ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_backends; i++) { ggml_tallocr_reset(sched->tallocs[i]); } + // reset state for the next run + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); + memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); + memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); } -ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) { +ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) { + GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS); - struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched)); - memset(sched, 0, sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); + + // initialize hash table + sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1); + sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1); sched->n_backends = n_backends; for (int i = 0; i < n_backends; i++) { @@ -1199,6 +1361,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { ggml_tallocr_free(sched->tallocs[i]); } ggml_gallocr_free(sched->galloc); + ggml_free(sched->ctx); free(sched->hash_set.keys); free(sched->node_talloc); free(sched->node_copies); @@ -1206,12 +1369,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - // initialize hash tables - size_t hash_size = measure_graph->visited_hash_table.size + 
GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS; - sched->hash_set.size = hash_size; - sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); - sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); - sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once sched_split_graph(sched, measure_graph); sched_alloc_splits(sched); @@ -1227,7 +1385,7 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr } void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); sched_split_graph(sched, graph); sched_alloc_splits(sched); @@ -1235,13 +1393,19 @@ void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg sched_reset(sched); } +int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { + return sched->n_splits; +} + ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return sched->tallocs[backend_index]; } ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return ggml_tallocr_get_buffer(sched->tallocs[backend_index]); } @@ -1252,9 +1416,10 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml } // utils + void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); - //GGML_ASSERT(tensor->data == 
NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized + //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); @@ -1320,6 +1485,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { + graph_init_tensor(hash_set, node_copies, node_init, src->view_src); ggml_backend_view_init(dst->view_src->buffer, dst); } else { @@ -1353,6 +1519,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s struct ggml_context * ctx_allocated = ggml_init(params); struct ggml_context * ctx_unallocated = ggml_init(params); + if (ctx_allocated == NULL || ctx_unallocated == NULL) { + fprintf(stderr, "failed to allocate context for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + ggml_free(ctx_allocated); + ggml_free(ctx_unallocated); + return (struct ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ NULL, + }; + } + // dup nodes for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1361,6 +1542,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // allocate nodes ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); + if (buffer == NULL) { + fprintf(stderr, "failed to allocate buffer for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + ggml_free(ctx_allocated); + ggml_free(ctx_unallocated); + return (struct ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ 
NULL, + }; + } //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024); @@ -1397,8 +1592,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { ggml_free(copy.ctx_unallocated); } -void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { +bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); + if (copy.buffer == NULL) { + return false; + } + struct ggml_cgraph * g1 = graph; struct ggml_cgraph * g2 = copy.graph; @@ -1428,4 +1627,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t } ggml_backend_graph_copy_free(copy); + + return true; } diff --git a/ggml-backend.h b/ggml-backend.h index 85ff67b0e..c4eff546a 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -17,22 +17,32 @@ extern "C" { // // buffer type - GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct 
ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer - GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); - GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); - GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer); + enum ggml_backend_buffer_usage { + GGML_BACKEND_BUFFER_USAGE_ANY = 0, + GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, + }; + + GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum 
ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + // // Backend @@ -140,24 +150,23 @@ extern "C" { typedef struct ggml_backend_sched * ggml_backend_sched_t; // Initialize a backend scheduler - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends); - - GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); - + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + // Get the number of splits of the last graph + GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend); - GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); - // Allocate a graph on the backend scheduler + // Allocate and compute graph on the backend scheduler GGML_API void ggml_backend_sched_graph_compute( ggml_backend_sched_t sched, struct ggml_cgraph * graph); - // // Utils // @@ -176,7 +185,7 @@ extern "C" { typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the 
output of two backends - GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); + GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); // Tensor initialization GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e26260a35..b5a4a7349 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8,8 +8,13 @@ #include #include #include +#include #include - +#include +#include +#include "ggml-cuda.h" +#include "ggml.h" +#include "ggml-backend-impl.h" #if defined(GGML_USE_HIPBLAS) #include @@ -77,6 +82,7 @@ #define cudaMemcpyKind hipMemcpyKind #define cudaMemset hipMemset #define cudaMemsetAsync hipMemsetAsync +#define cudaMemGetInfo hipMemGetInfo #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags @@ -112,10 +118,6 @@ #endif // defined(GGML_USE_HIPBLAS) -#include "ggml-cuda.h" -#include "ggml.h" -#include "ggml-backend-impl.h" - #define CC_PASCAL 600 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define CC_VOLTA 700 @@ -553,7 +555,7 @@ static void ggml_cuda_set_device(const int device) { static int g_device_count = -1; static int g_main_device = 0; -static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static std::array g_default_tensor_split = {}; struct cuda_device_capabilities { int cc; // compute capability @@ -564,10 +566,6 @@ struct cuda_device_capabilities { static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} }; -static void * g_scratch_buffer = nullptr; -static size_t g_scratch_size = 0; // 
disabled by default -static size_t g_scratch_offset = 0; - static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; [[noreturn]] @@ -7329,8 +7327,9 @@ void ggml_init_cublas() { CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); - g_tensor_split[id] = total_vram; + g_default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; + #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; #else @@ -7339,7 +7338,7 @@ void ggml_init_cublas() { g_device_caps[id].smpb = prop.sharedMemPerBlock; } for (int id = 0; id < g_device_count; ++id) { - g_tensor_split[id] /= total_vram; + g_default_tensor_split[id] /= total_vram; } for (int id = 0; id < g_device_count; ++id) { @@ -7363,30 +7362,6 @@ void ggml_init_cublas() { } } -void ggml_cuda_set_tensor_split(const float * tensor_split) { - if (tensor_split == nullptr) { - return; - } - bool all_zero = true; - for (int i = 0; i < g_device_count; ++i) { - if (tensor_split[i] != 0.0f) { - all_zero = false; - break; - } - } - if (all_zero) { - return; - } - float split_sum = 0.0f; - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] /= split_sum; - } -} - void * ggml_cuda_host_malloc(size_t size) { if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { return nullptr; @@ -7838,11 +7813,11 @@ static void ggml_cuda_op_mul_mat_q( (void) src1_ddf_i; } -static int64_t get_row_rounding(ggml_type type) { +static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { int64_t min_compute_capability = INT_MAX; int64_t max_compute_capability = INT_MIN; for (int id = 0; id < g_device_count; ++id) { - if (g_tensor_split[id] < (id + 1 < g_device_count ? 
g_tensor_split[id + 1] : 1.0f)) { + if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { if (min_compute_capability > g_device_caps[id].cc) { min_compute_capability = g_device_caps[id].cc; } @@ -7901,6 +7876,21 @@ static int64_t get_row_rounding(ggml_type type) { #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } +static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { + const int64_t nrows = ggml_nrows(tensor); + const int64_t rounding = get_row_rounding(tensor->type, tensor_split); + + *row_low = id == 0 ? 0 : nrows*tensor_split[id]; + *row_low -= *row_low % rounding; + + if (id == g_device_count - 1) { + *row_high = nrows; + } else { + *row_high = nrows*tensor_split[id + 1]; + *row_high -= *row_high % rounding; + } +} + static void ggml_cuda_op_mul_mat_vec_q( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, @@ -8515,6 +8505,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { peer_access_enabled = enable_peer_access; } +// FIXME: move this somewhere else +struct ggml_backend_cuda_split_buffer_type_context { + std::array tensor_split; +}; + static void ggml_cuda_op_mul_mat( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, const bool convert_src1_to_q8_1) { @@ -8566,6 +8561,14 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); + std::array tensor_split; + if (split) { + // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check + // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...); + ggml_backend_cuda_split_buffer_type_context * buft_ctx = 
(ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; + tensor_split = buft_ctx->tensor_split; + } + struct dev_data { cuda_pool_alloc src0_dd_alloc; cuda_pool_alloc src1_ddf_alloc; @@ -8593,17 +8596,17 @@ static void ggml_cuda_op_mul_mat( // for multi GPU, get the row boundaries from tensor split // and round to mul_mat_q tile sizes if (split) { - const int64_t rounding = get_row_rounding(src0->type); + const int64_t rounding = get_row_rounding(src0->type, tensor_split); if (id != 0) { - dev[id].row_low = ne01*g_tensor_split[id]; + dev[id].row_low = ne01*tensor_split[id]; if (dev[id].row_low < ne01) { dev[id].row_low -= dev[id].row_low % rounding; } } if (id != g_device_count - 1) { - dev[id].row_high = ne01*g_tensor_split[id + 1]; + dev[id].row_high = ne01*tensor_split[id + 1]; if (dev[id].row_high < ne01) { dev[id].row_high -= dev[id].row_high % rounding; } @@ -9149,10 +9152,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; - for (int id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { - min_compute_capability = g_device_caps[id].cc; + + if (split) { + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; + auto & tensor_split = buft_ctx->tensor_split; + for (int id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? 
tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_device_caps[id].cc; + } } + } else { + min_compute_capability = g_device_caps[g_main_device].cc; } #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) @@ -9191,7 +9201,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { @@ -9653,247 +9663,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { - const int64_t nrows = ggml_nrows(tensor); - - const int64_t ne0 = tensor->ne[0]; - - const size_t nb1 = tensor->nb[1]; - - ggml_backend_type backend = tensor->backend; - ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); - - for (int id = 0; id < g_device_count; ++id) { - if (backend == GGML_BACKEND_GPU && id != g_main_device) { - continue; - } - - ggml_cuda_set_device(id); - - int64_t row_low, row_high; - if (backend == GGML_BACKEND_GPU) { - row_low = 0; - row_high = nrows; - } else if (backend == GGML_BACKEND_GPU_SPLIT) { - const int64_t rounding = get_row_rounding(tensor->type); - - row_low = id == 0 ? 
0 : nrows*g_tensor_split[id]; - row_low -= row_low % rounding; - - if (id == g_device_count - 1) { - row_high = nrows; - } else { - row_high = nrows*g_tensor_split[id + 1]; - row_high -= row_high % rounding; - } - } else { - GGML_ASSERT(false); - } - if (row_low == row_high) { - continue; - } - - int64_t nrows_split = row_high - row_low; - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf; - CUDA_CHECK(cudaMalloc(&buf, size)); - char * buf_host = (char *)data + offset_split; - - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); - } - - CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); - - extra->data_device[id] = buf; - - if (backend == GGML_BACKEND_GPU_SPLIT) { - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); - } - } - } - - tensor->extra = extra; -} - -void ggml_cuda_free_data(struct ggml_tensor * tensor) { - if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { - return; - } - - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - - for (int id = 0; id < g_device_count; ++id) { - ggml_cuda_set_device(id); - if (extra->data_device[id] != nullptr) { - CUDA_CHECK(cudaFree(extra->data_device[id])); - } - - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - if (extra->events[id][is] != nullptr) { - CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); - } - } - } - - delete extra; -} - -static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; -static 
size_t g_temp_tensor_extra_index = 0; - -static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { - if (g_temp_tensor_extras == nullptr) { - g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; - } - - size_t alloc_index = g_temp_tensor_extra_index; - g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; - ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; - memset(extra, 0, sizeof(*extra)); - - return extra; -} - -static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { - if (scratch && g_scratch_size == 0) { - return; - } - - tensor->backend = GGML_BACKEND_GPU; - - // recursively assign CUDA buffers until a compute tensor is found - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { - const ggml_op src0_op = tensor->src[0]->op; - if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { - ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); - } - } - if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { - ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); - } - - if (scratch && no_alloc) { - return; - } - - ggml_tensor_extra_gpu * extra; - - const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || - tensor->op == GGML_OP_VIEW || - force_inplace; - const size_t size = ggml_nbytes(tensor); - - ggml_cuda_set_device(g_main_device); - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; - size_t offset = 0; - if (tensor->op == GGML_OP_VIEW) { - memcpy(&offset, tensor->op_params, sizeof(size_t)); - } - 
extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = src0_ddc + offset; - } else if (tensor->op == GGML_OP_CPY) { - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; - void * src1_ddv = src1_extra->data_device[g_main_device]; - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = src1_ddv; - } else if (scratch) { - GGML_ASSERT(size <= g_scratch_size); - if (g_scratch_offset + size > g_scratch_size) { - g_scratch_offset = 0; - } - - char * data = (char *) g_scratch_buffer; - if (data == nullptr) { - CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); - g_scratch_buffer = data; - } - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = data + g_scratch_offset; - - g_scratch_offset += size; - - GGML_ASSERT(g_scratch_offset <= g_scratch_size); - } else { // allocate new buffers outside of scratch - void * data; - CUDA_CHECK(cudaMalloc(&data, size)); - CUDA_CHECK(cudaMemset(data, 0, size)); - extra = new ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); - extra->data_device[g_main_device] = data; - } - - tensor->extra = extra; -} - -void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) { - if (g_scratch_size == 0) { - return; - } - if (g_scratch_buffer == nullptr) { - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); - } - - ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); - - const bool inplace = tensor->view_src != nullptr; - - if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; - size_t view_offset = 0; - if (tensor->op == GGML_OP_VIEW) { - memcpy(&view_offset, tensor->op_params, sizeof(size_t)); - } - 
extra->data_device[g_main_device] = src0_ddc + view_offset; - } else { - extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; - } - - tensor->extra = extra; -} - -void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - GGML_ASSERT(ggml_is_contiguous(tensor)); - - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); -} - -void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true, false, false); -} - -void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true, false, true); -} - -void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, false, false); -} - -void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, true, false); -} - -void ggml_cuda_set_main_device(const int main_device) { +static void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. 
Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -9902,30 +9672,12 @@ void ggml_cuda_set_main_device(const int main_device) { if (g_main_device != main_device && g_device_count > 1) { g_main_device = main_device; - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); - fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); + //cudaDeviceProp prop; + //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); + //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); } } -void ggml_cuda_set_scratch_size(const size_t scratch_size) { - // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously - // it still won't always work as expected, but it's better than nothing - if (scratch_size > g_scratch_size) { - ggml_cuda_free_scratch(); - } - g_scratch_size = std::max(g_scratch_size, scratch_size); -} - -void ggml_cuda_free_scratch() { - if (g_scratch_buffer == nullptr) { - return; - } - - CUDA_CHECK(cudaFree(g_scratch_buffer)); - g_scratch_buffer = nullptr; -} - bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { if (!g_cublas_loaded) return false; @@ -10104,6 +9856,11 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des #define UNUSED GGML_UNUSED +struct ggml_backend_context_cuda { + int device; + std::string name; +}; + // cuda buffer struct ggml_backend_buffer_context_cuda { @@ -10111,8 +9868,12 @@ struct ggml_backend_buffer_context_cuda { void * dev_ptr = nullptr; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; + std::string name; - ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} + ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : + device(device), dev_ptr(dev_ptr), + name(GGML_CUDA_NAME + 
std::to_string(device)) { + } ~ggml_backend_buffer_context_cuda() { delete[] temp_tensor_extras; @@ -10132,6 +9893,11 @@ struct ggml_backend_buffer_context_cuda { } }; +static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + return ctx->name.c_str(); +} + static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; CUDA_CHECK(cudaFree(ctx->dev_ptr)); @@ -10173,8 +9939,6 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0])); } } - - UNUSED(buffer); } static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -10184,8 +9948,8 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaDeviceSynchronize()); } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -10195,7 +9959,6 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); } @@ -10204,11 +9967,12 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); + 
CUDA_CHECK(cudaDeviceSynchronize()); } -static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { +static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, @@ -10217,23 +9981,39 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { /* .cpy_tensor_from = */ NULL, /* .cpy_tensor_to = */ NULL, /* .clear = */ ggml_backend_cuda_buffer_clear, + /* .reset = */ NULL, }; // cuda buffer type -static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - int device = (int) (intptr_t) buft->context; +struct ggml_backend_cuda_buffer_type_context { + int device; + std::string name; +}; - ggml_cuda_set_device(device); +static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { + ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} + +static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + + ggml_cuda_set_device(buft_ctx->device); size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 void * dev_ptr; - CUDA_CHECK(cudaMalloc(&dev_ptr, size)); + cudaError_t err = cudaMalloc(&dev_ptr, size); + if (err != cudaSuccess) { + fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err)); + return nullptr; + } - ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr); + ggml_backend_buffer_context_cuda * ctx = new 
ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr); - return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -10242,7 +10022,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_ty UNUSED(buft); } -static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { +static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { int64_t row_low = 0; int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; @@ -10263,21 +10043,32 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t } static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cuda(backend); + if (!ggml_backend_is_cuda(backend)) { + return false; + } - UNUSED(buft); + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + return buft_ctx->device == cuda_ctx->device; } static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, - /* .is_host = */ nullptr, + /* .is_host = */ NULL, }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { - static struct ggml_backend_buffer_type 
ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + // FIXME: this is not thread safe + if (device >= ggml_backend_cuda_get_device_count()) { + return nullptr; + } + + static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; static bool ggml_backend_cuda_buffer_type_initialized = false; @@ -10285,7 +10076,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { ggml_backend_cuda_buffer_types[i] = { /* .iface = */ ggml_backend_cuda_buffer_type_interface, - /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, + /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)}, }; } ggml_backend_cuda_buffer_type_initialized = true; @@ -10294,8 +10085,298 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { return &ggml_backend_cuda_buffer_types[device]; } +// cuda split buffer + +struct ggml_backend_cuda_split_buffer_context { + ~ggml_backend_cuda_split_buffer_context() { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int id = 0; id < g_device_count; ++id) { + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + CUDA_CHECK(cudaFree(extra->data_device[id])); + } + delete extra; + } + } + + std::vector tensor_extras; +}; + +static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Split"; + + UNUSED(buffer); +} + +static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + delete ctx; +} + +static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { + // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced + return (void *)0x1000; + + UNUSED(buffer); +} + +static void 
ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + + ctx->tensor_extras.push_back(extra); + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + // FIXME: do not crash if cudaMalloc fails + // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_cuda_set_device(id); + char * buf; + CUDA_CHECK(cudaMalloc(&buf, size)); + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + + extra->data_device[id] = buf; + + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + } + } + tensor->backend = GGML_BACKEND_GPU_SPLIT; + tensor->extra = extra; +} + +static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + 
GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + const char * buf_host = (const char *)data + offset_split; + CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice)); + } +} + +static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, 
nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf_host = (char *)data + offset_split; + CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost)); + } +} + +static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + UNUSED(buffer); + UNUSED(value); +} + +static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_get_name, + /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, + /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, + /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor, + /* .cpy_tensor_from = */ NULL, + /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_cuda_split_buffer_clear, + /* .reset = */ NULL, +}; + +// cuda split buffer type + +static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Split"; + + UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point + // instead, we allocate them for each tensor separately in init_tensor + // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, + // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. 
+ ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); + + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); +} + +static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + + UNUSED(buft); +} + +static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; + + size_t total_size = 0; + + const int64_t ne0 = tensor->ne[0]; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + total_size += ggml_nbytes_split(tensor, nrows_split); + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return total_size; +} + +static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_cuda(backend); + + UNUSED(buft); +} + +static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, + /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, + /* .is_host = */ 
ggml_backend_cuda_split_buffer_type_is_host, +}; + +ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { + // FIXME: this is not thread safe + static std::map, struct ggml_backend_buffer_type> buft_map; + + std::array tensor_split_arr = {}; + + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; }); + if (all_zero) { + tensor_split_arr = g_default_tensor_split; + } else { + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + tensor_split_arr[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + tensor_split_arr[i] /= split_sum; + } + } + + auto it = buft_map.find(tensor_split_arr); + if (it != buft_map.end()) { + return &it->second; + } + + struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, + /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, + }; + + auto result = buft_map.emplace(tensor_split_arr, buft); + return &result.first->second; +} + // host buffer type +static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Host"; + + UNUSED(buft); +} + +static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Host"; + + UNUSED(buffer); +} + static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_cuda_host_free(buffer->context); } @@ -10308,9 +10389,9 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cuda_host_buffer_name; 
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; return buffer; @@ -10319,6 +10400,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { /* .iface = */ { + /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, @@ -10333,14 +10415,10 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { // backend -struct ggml_backend_context_cuda { - int device; -}; - static const char * ggml_backend_cuda_name(ggml_backend_t backend) { - return GGML_CUDA_NAME; + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; - UNUSED(backend); + return cuda_ctx->name.c_str(); } static void ggml_backend_cuda_free(ggml_backend_t backend) { @@ -10382,29 +10460,6 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { - GGML_ASSERT(!"not implemented"); - - return nullptr; - - UNUSED(backend); - UNUSED(cgraph); -} - -static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; @@ -10419,46 +10474,25 @@ 
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) continue; - assert(node->backend == GGML_BACKEND_GPU); +#ifndef NDEBUG + assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT); assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU); - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); + //assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->src[j]->extra != nullptr); } } +#endif bool ok = ggml_cuda_compute_forward(&params, node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); - -#if 0 - if (node->type == GGML_TYPE_F32) { - cudaDeviceSynchronize(); - std::vector tmp(ggml_nelements(node), 0.0f); - cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); - printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), - ggml_type_name(node->src[0]->type), - node->src[1] ? ggml_type_name(node->src[1]->type) : "none", - node->src[0]->name, - node->src[1] ? 
node->src[1]->name : "none"); - double sum = 0.0; - double sq_sum = 0.0; - for (int i = 0; i < ggml_nelements(node); i++) { - printf("%f ", tmp[i]); - sum += tmp[i]; - sq_sum += tmp[i]*tmp[i]; - } - printf("\n"); - printf("sum: %f, ", sum); - printf("sq_sum: %f\n", sq_sum); - } -#endif } UNUSED(backend); @@ -10577,7 +10611,7 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten UNUSED(backend); } -static ggml_backend_i cuda_backend_i = { +static ggml_backend_i ggml_backend_cuda_interface = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, @@ -10586,9 +10620,9 @@ static ggml_backend_i cuda_backend_i = { /* .cpy_tensor_from_async = */ NULL, /* .cpy_tensor_to_async = */ NULL, /* .synchronize = */ ggml_backend_cuda_synchronize, - /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, - /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, - /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, }; @@ -10605,11 +10639,12 @@ ggml_backend_t ggml_backend_cuda_init(int device) { ggml_cuda_set_main_device(device); ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda { - /* .device = */ device + /* .device = */ device, + /* .name = */ GGML_CUDA_NAME + std::to_string(device), }; ggml_backend_t cuda_backend = new ggml_backend { - /* .interface = */ cuda_backend_i, + /* .interface = */ ggml_backend_cuda_interface, /* .context = */ ctx }; @@ -10617,9 +10652,24 @@ ggml_backend_t ggml_backend_cuda_init(int device) { } bool ggml_backend_is_cuda(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cuda_name; + return backend && backend->iface.get_name == ggml_backend_cuda_name; } 
+int ggml_backend_cuda_get_device_count() { + return ggml_cuda_get_device_count(); +} + +void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { + ggml_cuda_get_device_description(device, description, description_size); +} + +void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { + ggml_cuda_set_device(device); + + CUDA_CHECK(cudaMemGetInfo(free, total)); +} + +// backend registry static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); return cuda_backend; diff --git a/ggml-cuda.h b/ggml-cuda.h index cdb0c0c41..d19cbf3fd 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -27,22 +27,6 @@ GGML_API void * ggml_cuda_host_malloc(size_t size); GGML_API void ggml_cuda_host_free(void * ptr); GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); -GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); -GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); -GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_set_main_device(int main_device); -GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); -GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size); -GGML_API void ggml_cuda_free_scratch(void); GGML_API bool ggml_cuda_compute_forward(struct 
ggml_compute_params * params, struct ggml_tensor * tensor); GGML_API int ggml_cuda_get_device_count(void); @@ -52,13 +36,17 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description, GGML_API ggml_backend_t ggml_backend_cuda_init(int device); GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); -GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); - -// pinned host buffer for use with CPU backend for faster copies between CPU and GPU +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +GGML_API int ggml_backend_cuda_get_device_count(void); +GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); + #ifdef __cplusplus } #endif diff --git a/ggml-impl.h b/ggml-impl.h index 2faced080..2c58075ac 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_HASHTABLE_FULL ((size_t)-1) #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) +struct ggml_hash_set ggml_hash_set_new(size_t size); + bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key); // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp new file mode 100644 index 000000000..520cd1fd7 --- /dev/null +++ b/ggml-kompute.cpp @@ -0,0 +1,1878 @@ +#include "ggml.h" +#include "ggml-backend.h" +#include "ggml-backend-impl.h" +#include 
"ggml-kompute.h" + +// These are generated at build time by cmake custom command +#include "shaderop_scale.h" +#include "shaderop_scale_8.h" +#include "shaderop_add.h" +#include "shaderop_addrow.h" +#include "shaderop_mul.h" +#include "shaderop_mulrow.h" +#include "shaderop_silu.h" +#include "shaderop_relu.h" +#include "shaderop_gelu.h" +#include "shaderop_softmax.h" +#include "shaderop_norm.h" +#include "shaderop_rmsnorm.h" +#include "shaderop_diagmask.h" +#include "shaderop_mul_mat_f16.h" +#include "shaderop_mul_mat_q8_0.h" +#include "shaderop_mul_mat_q4_0.h" +#include "shaderop_mul_mat_q4_1.h" +#include "shaderop_mul_mat_q6_k.h" +#include "shaderop_mul_mat_mat_f32.h" +#include "shaderop_getrows_f16.h" +#include "shaderop_getrows_q4_0.h" +#include "shaderop_getrows_q4_1.h" +#include "shaderop_getrows_q6_k.h" +#include "shaderop_rope_f16.h" +#include "shaderop_rope_f32.h" +#include "shaderop_cpy_f16_f16.h" +#include "shaderop_cpy_f16_f32.h" +#include "shaderop_cpy_f32_f16.h" +#include "shaderop_cpy_f32_f32.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 +#define QK_NL 16 + +typedef ggml_fp16_t half; +struct ggml_kompute_context { + std::shared_ptr pool; +}; + +// FIXME: It would be good to consolidate the kompute manager and the kompute context into one object +// and consolidate the init functions and simplify object lifetime management. As it currently stands, +// we *have* to have the kompute manager no matter what for device discovery, but the kompute context +// is only created when a device is set and vulkan is explicitly turned on. 
+static ggml_kompute_context *s_kompute_context = nullptr; + +class kompute_manager { + kp::Manager *s_mgr = nullptr; + +public: + kp::Manager *operator()() { + if (s_mgr && !s_mgr->hasInstance()) { + destroy(); + } + if (!s_mgr) { + s_mgr = new kp::Manager; + } + return s_mgr; + } + + void destroy() { + delete s_mgr; + s_mgr = nullptr; + } +}; + +static kompute_manager komputeManager; + +#ifdef __linux__ +__attribute__((constructor)) +static void enable_sam() { + setenv("RADV_PERFTEST", "sam", false); +} +#endif + +static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physicalDevice) { + vk::PhysicalDeviceFeatures availableFeatures; + physicalDevice.getFeatures(&availableFeatures); + + if (!availableFeatures.shaderInt16) + return false; + + vk::PhysicalDeviceVulkan11Features availableFeatures11; + vk::PhysicalDeviceVulkan12Features availableFeatures12; + + availableFeatures11.pNext = &availableFeatures12; + availableFeatures12.pNext = nullptr; + + vk::PhysicalDeviceFeatures2 features2; + features2.pNext = &availableFeatures11; + + physicalDevice.getFeatures2(&features2); + + if (!availableFeatures11.uniformAndStorageBuffer16BitAccess || + !availableFeatures11.storageBuffer16BitAccess) { + return false; + } + + if (!availableFeatures12.storageBuffer8BitAccess || + !availableFeatures12.uniformAndStorageBuffer8BitAccess || + !availableFeatures12.shaderFloat16 || + !availableFeatures12.shaderInt8) { + return false; + } + + return true; +} + +static std::string ggml_vk_getVendorName(uint32_t vendorID) { + switch (vendorID) { + case 0x10DE: + return "nvidia"; + case 0x1002: + return "amd"; + case 0x8086: + return "intel"; + default: + return "unknown"; + } +} + +std::vector ggml_vk_available_devices(size_t memoryRequired) { + std::vector results; + if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance()) + return results; + + std::vector physicalDevices; + try { + physicalDevices = komputeManager()->listDevices(); + } catch (vk::SystemError & 
err) { + std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n"; + return results; + } + + uint32_t deviceCount = physicalDevices.size(); + if (deviceCount == 0) + return results; + + std::unordered_map count_by_name; + + for (uint32_t i = 0; i < deviceCount; i++) { + VkPhysicalDeviceProperties properties = physicalDevices.at(i).getProperties(); + VkPhysicalDeviceMemoryProperties memoryProperties = physicalDevices.at(i).getMemoryProperties(); + const uint32_t major = VK_VERSION_MAJOR(properties.apiVersion); + const uint32_t minor = VK_VERSION_MINOR(properties.apiVersion); + if (major < 1 || minor < 2) + continue; + + if (!ggml_vk_checkPhysicalDeviceFeatures(physicalDevices.at(i))) + continue; + + size_t heapSize = 0; + for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) { + VkMemoryHeap heap = memoryProperties.memoryHeaps[j]; + if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) { + heapSize = heap.size; + break; + } + } + + if (heapSize < memoryRequired) + continue; + + vk::PhysicalDeviceSubgroupProperties subgroupProperties; + vk::PhysicalDeviceProperties2 deviceProperties2; + deviceProperties2.pNext = &subgroupProperties; + physicalDevices.at(i).getProperties2(&deviceProperties2); + + if (subgroupProperties.subgroupSize < 32) + continue; + + ggml_vk_device d; + d.index = i; + d.type = properties.deviceType; + d.heapSize = heapSize; + d.name = properties.deviceName; + d.subgroupSize = subgroupProperties.subgroupSize; + size_t n_idx = ++count_by_name[d.name]; + if (n_idx > 1) { + d.name += " (" + std::to_string(n_idx) + ")"; + } + d.vendor = ggml_vk_getVendorName(properties.vendorID); + results.push_back(d); + } + + std::stable_sort(results.begin(), results.end(), + [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool { + if (lhs.type != rhs.type) { + if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true; + if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false; + + if (lhs.type == 
VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true; + if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false; + } + return lhs.heapSize < rhs.heapSize; + } + ); + + return results; +} + +static void ggml_vk_filterByVendor(std::vector& devices, const std::string& targetVendor) { + devices.erase( + std::remove_if(devices.begin(), devices.end(), + [&targetVendor](const ggml_vk_device& device) { + return device.vendor != targetVendor; + }), + devices.end() + ); +} + +static void ggml_vk_filterByName(std::vector& devices, const std::string& targetName) { + devices.erase( + std::remove_if(devices.begin(), devices.end(), + [&targetName](const ggml_vk_device& device) { + return device.name != targetName; + }), + devices.end() + ); +} + +bool ggml_vk_init_device(size_t memoryRequired, const std::string &device) { + if (device.empty()) + return false; + + std::vector devices = ggml_vk_available_devices(memoryRequired); + if (device == "gpu") { + if (devices.size() != 0) + return ggml_vk_init_device(devices.front()); + } else if (device == "amd" || device == "nvidia" || device == "intel") { + ggml_vk_filterByVendor(devices, device); + if (devices.size() != 0) + return ggml_vk_init_device(devices.front()); + } else { + ggml_vk_filterByName(devices, device); + if (devices.size() != 0) + return ggml_vk_init_device(devices.front()); + } + + return ggml_vk_has_device(); +} + +bool ggml_vk_init_device(const ggml_vk_device &device) { + return ggml_vk_init_device(device.index); +} + +bool ggml_vk_init_device(int device) { + komputeManager()->initializeDevice(device, {}, + {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage", + "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"}); + return ggml_vk_has_device(); +} + +bool ggml_vk_free_device() { + if (!ggml_vk_has_device()) + return false; + komputeManager.destroy(); + // FIXME: The lifetime of these two needs to be tied together as we're relying upon the fact + // the llama_free(ctx) destroys this memory and we 
just set the singleton to nullptr here which
+    // is very brittle
+    s_kompute_context = nullptr;
+    return true;
+}
+
+// Returns whatever the Kompute manager reports from hasVulkan().
+bool ggml_vk_has_vulkan() {
+    return komputeManager()->hasVulkan();
+}
+
+// Returns whatever the Kompute manager reports from hasDevice().
+bool ggml_vk_has_device() {
+    return komputeManager()->hasDevice();
+}
+
+// True while a ggml_kompute_context singleton exists (set by ggml_vk_init, cleared by ggml_vk_free).
+bool ggml_vk_using_vulkan() {
+    return s_kompute_context != nullptr;
+}
+
+// Returns the ggml_vk_device entry whose name matches the manager's current physical device.
+// A default-constructed ggml_vk_device is returned when no device is initialized, or when the
+// current device does not appear in the list produced by ggml_vk_available_devices(0).
+ggml_vk_device ggml_vk_current_device() {
+    if (!komputeManager()->hasDevice())
+        return ggml_vk_device();
+
+    std::vector devices = ggml_vk_available_devices(0);
+    ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName);
+    // ggml_vk_available_devices() skips devices that fail its feature/subgroup checks and returns
+    // an empty list if enumeration throws, so nothing may be left after filtering by name.
+    // Calling front() on an empty vector is undefined behavior; fall back to the same default
+    // value used by the no-device branch above.
+    if (devices.empty())
+        return ggml_vk_device();
+    return devices.front();
+}
+
+// Creates the ggml_kompute_context singleton; asserts that none exists yet.
+ggml_kompute_context *ggml_vk_init() {
+    GGML_ASSERT(s_kompute_context == nullptr);
+    s_kompute_context = new ggml_kompute_context;
+    return s_kompute_context;
+}
+
+// Clears the singleton pointer and deletes ctx (a null ctx is tolerated).
+// The assert checks that ctx is the singleton created by ggml_vk_init.
+void ggml_vk_free(struct ggml_kompute_context * ctx) {
+    assert(ctx == s_kompute_context);
+    s_kompute_context = nullptr;
+    if (ctx != nullptr) {
+        delete ctx;
+    }
+}
+
+// Creates a descriptor pool with `size` max sets and 3 * size storage-buffer descriptors and
+// stores it in ctx->pool. A failure is only reported on std::cerr.
+static
+void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
+    std::vector descriptorPoolSizes = {
+        vk::DescriptorPoolSize(
+            vk::DescriptorType::eStorageBuffer,
+            3 * size // Descriptor count is number of possible tensors to pass into an algorithm
+            )
+    };
+
+    vk::DescriptorPoolCreateInfo descriptorPoolInfo(
+        vk::DescriptorPoolCreateFlags(),
+        size, // Max sets
+        static_cast(descriptorPoolSizes.size()),
+        descriptorPoolSizes.data());
+
+    ctx->pool = std::make_shared();
+    vk::Result r = komputeManager()->device()->createDescriptorPool(
+        &descriptorPoolInfo, nullptr, ctx->pool.get());
+    if (r != vk::Result::eSuccess)
+        std::cerr << "Error allocating descriptor pool" << vk::to_string(r);
+}
+
+// Destroys ctx->pool on the manager's device (if one is set) and resets the pointer.
+static
+void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) {
+    if (ctx->pool) {
+        komputeManager()->device()->destroy(
+            *ctx->pool,
+            (vk::Optional)nullptr);
+        ctx->pool = nullptr;
+    }
+}
+
+static
+vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
+    vk::BufferCreateInfo
bufferCreateInfo; + bufferCreateInfo.size = size; + bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst; + bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive; + + vk::Buffer *vkBuffer = new vk::Buffer; + vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer); + if (r != vk::Result::eSuccess) + std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl; + return vkBuffer; +} + +static +vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) { + + uint32_t memoryTypeIndex = -1; + bool memoryTypeIndexFound = false; + vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties(); + for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) { + const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i]; + const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex]; + if (memoryHeap.size < size) { + continue; + } + + if (requirements.memoryTypeBits & (1 << i)) { + if (((memoryProperties.memoryTypes[i]).propertyFlags & + flags) == flags) { + memoryTypeIndex = i; + memoryTypeIndexFound = true; + if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) { + *isHostVisible = true; + } + break; + } + } + } + if (!memoryTypeIndexFound) { + throw std::runtime_error( + "Memory type index for buffer creation not found"); + } + + vk::MemoryAllocateInfo allocInfo; + allocInfo.allocationSize = size; + allocInfo.memoryTypeIndex = memoryTypeIndex; + vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory; + vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory); + if (r != vk::Result::eSuccess) { + std::cerr << "Error allocating memory " << vk::to_string(r) << 
std::endl; + throw std::runtime_error("Error allocating vulkan memory."); + } + return vkDeviceMemory; +} + +static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) { + size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer); + if (minStorageBufferOffsetAlignment == 0) { + vk::PhysicalDeviceProperties deviceProperties; + deviceProperties = komputeManager()->physicalDevice()->getProperties(); + vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits; + minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment; + } + + // If offset is already aligned, return it directly + if (offset % minStorageBufferOffsetAlignment == 0) { + return offset; + } + + // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset + return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment; +} + +static ggml_vk_memory ggml_vk_allocate(size_t size) { + ggml_vk_memory memory; + bool isHostVisible = false; + { + memory.primaryBuffer = ggml_vk_allocate_buffer(size); + vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer); + vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal; + memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible); + komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0); + if (isHostVisible) { + vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data); + if (r != vk::Result::eSuccess) + std::cerr << "Error mapping memory" << vk::to_string(r); + } + } + + if (!isHostVisible) { + memory.stagingBuffer = ggml_vk_allocate_buffer(size); + vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer); + vk::MemoryPropertyFlags 
memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible | + vk::MemoryPropertyFlagBits::eHostCoherent | + vk::MemoryPropertyFlagBits::eHostCached; + memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible); + komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0); + vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data); + if (r != vk::Result::eSuccess) + std::cerr << "Error mapping memory" << vk::to_string(r); + } + + memory.size = size; + return memory; +} + +void ggml_vk_free_memory(ggml_vk_memory &memory) +{ + komputeManager()->device()->destroy( + *memory.primaryBuffer, + (vk::Optional)nullptr); + if (memory.stagingBuffer) { + komputeManager()->device()->destroy( + *memory.stagingBuffer, + (vk::Optional)nullptr); + } + komputeManager()->device()->freeMemory( + *memory.primaryMemory, + (vk::Optional)nullptr); + if (memory.stagingMemory) { + komputeManager()->device()->freeMemory( + *memory.stagingMemory, + (vk::Optional)nullptr); + } +} + +static +ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) { + ggml_backend_buffer_t buffer = t->view_src ? 
t->view_src->buffer : t->buffer; + + // compatibility with ggml-backend + GGML_ASSERT(buffer && buffer->buft == ggml_backend_kompute_buffer_type()); + + ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context; + + const intptr_t ioffs = reinterpret_cast(t->data) - reinterpret_cast(buf_ctx->data); + + GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)buffer->size); + + offset = (uint64_t)ioffs; + return buf_ctx; +} + +static +const std::shared_ptr ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) { + uint64_t originalOffset = 0; + auto * res = ggml_vk_find_tensor(t, originalOffset); + if (!res) { + static std::shared_ptr nullTensor = nullptr; + return nullTensor; + } + + // Create a tensor whose memory will be composed of our buffers at the correct offset + const size_t nelements = ggml_nelements(t); + size_t nbytes = ggml_nbytes(t); + + size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset); + if (alignedOffset) { + *alignedOffset = originalOffset - vulkanOffset; + nbytes += *alignedOffset; + } + + return komputeManager()->tensor( + t->data, + nelements, + nbytes, kp::Tensor::TensorDataTypes::eFloat, + res->primaryMemory, res->primaryBuffer, + res->stagingMemory, res->stagingBuffer, + vulkanOffset); +} + +static std::vector getSpirvShader(const unsigned char* rawData, size_t size) { + if (size % sizeof(uint32_t) != 0) { + throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)"); + } + + const uint32_t* data_ptr = reinterpret_cast(rawData); + size_t count = size / sizeof(uint32_t); + return std::vector(data_ptr, data_ptr + count); +} + +inline static +uint32_t safe_divide(uint32_t a, uint32_t b) { + if (b <= 1) { + return a; + } + if ((a % b) != 0) { + fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b); + GGML_ASSERT(!"safe_divide result would've had remainder"); + } + return a / b; +} + +static void ggml_vk_add( + kp::Sequence& seq, + const std::shared_ptr& 
inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03, + int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03, + int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, + int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13, + int32_t ne0, + int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3 +) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv, + kp::shader_data::op_add_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00; + int32_t nb00, nb01, nb02, nb03; + int32_t ne10, ne11, ne12, ne13; + int32_t nb10, nb11, nb12, nb13; + int32_t ne0; + int32_t nb0, nb1, nb2, nb3; + } const pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, + nb00, nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb10, nb11, nb12, nb13, + ne0, + nb0, nb1, nb2, nb3 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +static void ggml_vk_addrow(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + uint32_t size, uint32_t row = 0) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv, + kp::shader_data::op_addrow_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + uint32_t row; + } const 
pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), + row + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +static void ggml_vk_mul(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + uint32_t size) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv, + kp::shader_data::op_mul_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + } const pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4) + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +static void ggml_vk_mulrow(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + uint32_t size, uint32_t row = 0) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_mulrow_comp_spv, + kp::shader_data::op_mulrow_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + uint32_t row; + } const 
pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), + row + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +// Record an element-wise scale of `in` into `out` on `seq`. +// Byte offsets are divided by 4 before being pushed to the shader. When `size` is a multiple +// of 8 the op_scale_8 shader is dispatched with size/8 workgroups, otherwise op_scale with `size`. +static void ggml_vk_scale(kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + uint32_t size, float scale) { + const static auto spirv_1 = getSpirvShader( + kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len + ); + const static auto spirv_8 = getSpirvShader( + kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len + ); + + struct PushConstants { + uint32_t inOff, outOff; + float scale; + } const pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + scale + }; + + const auto * spirv = &spirv_1; + std::string name(__func__); + if (size % 8 == 0) { + size /= 8; + name += "_8"; + spirv = &spirv_8; + } + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(name)) { + // Register under `name`, the same key used for the lookup, so each shader variant keeps its own entry. + s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(name); + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +// Shared recorder for the unary activation shaders; `size` is the workgroup count. +// `suffix` (optional) is appended to the cached algorithm name. The reuse path below never +// replaces the SPIR-V, so callers passing different shaders must pass different suffixes. +static void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + uint32_t size, + const char * suffix = nullptr) { + struct PushConstants { + uint32_t 
inOff, outOff; + } const pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + }; + + // Cache key: function name plus the per-shader suffix, when one is given. + std::string name(__func__); + if (suffix) { + name += "_"; + name += suffix; + } + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(name)) + s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo = komputeManager()->getAlgorithm(name); + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +// SiLU: forwards to ggml_vk_xxlu with the op_silu shader and its own cache suffix. +template +static void ggml_vk_silu(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv, + kp::shader_data::op_silu_comp_spv_len); + + ggml_vk_xxlu(spirv, std::forward(args)..., "silu"); +} + +// ReLU: forwards to ggml_vk_xxlu with the op_relu shader and its own cache suffix. +template +static void ggml_vk_relu(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv, + kp::shader_data::op_relu_comp_spv_len); + + ggml_vk_xxlu(spirv, std::forward(args)..., "relu"); +} + +// GELU: forwards to ggml_vk_xxlu with the op_gelu shader and its own cache suffix. +template +static void ggml_vk_gelu(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv, + kp::shader_data::op_gelu_comp_spv_len); + + ggml_vk_xxlu(spirv, std::forward(args)..., "gelu"); +} + +static void ggml_vk_soft_max(kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv, + kp::shader_data::op_softmax_comp_spv_len); + + struct PushConstants { + uint32_t inOff, outOff; + int32_t ne00, ne01, ne02; + } pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + ne00, ne01, ne02 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device + const uint32_t local_x = 32; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +// Shared recorder for the norm / rms_norm shaders; dispatches `nrows` workgroups. +// `suffix` (optional) is appended to the cached algorithm name so that each shader passed in +// gets its own cache entry; the reuse path below does not replace the SPIR-V. +static void ggml_vk_norm_(const std::vector& spirv, kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + int32_t ne00, int32_t nb01, + int32_t nrows, float epsilon, + const char * suffix = nullptr) { + GGML_ASSERT(nb01%sizeof(float) == 0); + GGML_ASSERT(ne00%sizeof(float) == 0); + + struct PushConstants { + uint32_t inOff, outOff; + uint32_t ne00, nb01; + float eps; + } pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + (uint32_t)ne00, (uint32_t)nb01, epsilon + }; + + // Cache key: function name plus the per-shader suffix, when one is given. + std::string name(__func__); + if (suffix) { + name += "_"; + name += suffix; + } + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(name)) { + s_algo = 
komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(name); + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({(uint32_t)nrows}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +// Norm: forwards to ggml_vk_norm_ with the op_norm shader and its own cache suffix. +template +static void ggml_vk_norm(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv, + kp::shader_data::op_norm_comp_spv_len); + + ggml_vk_norm_(spirv, std::forward(args)..., "norm"); +} + +// RMS norm: forwards to ggml_vk_norm_ with the op_rmsnorm shader and its own cache suffix. +template +static void ggml_vk_rms_norm(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv, + kp::shader_data::op_rmsnorm_comp_spv_len); + + ggml_vk_norm_(spirv, std::forward(args)..., "rms_norm"); +} + +static void ggml_vk_diag_mask_inf(kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + uint32_t n_past, + int32_t ne00, int32_t ne01, int32_t ne02) { + const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv, + kp::shader_data::op_diagmask_comp_spv_len); + + struct PushConstants { + uint32_t inOff, outOff; + uint32_t n_past; + int32_t ne00, ne01; + } pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + n_past, + ne00, ne01 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts}); + else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +static void ggml_vk_mul_mat_f16(kp::Sequence& 
seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + int32_t ne0, int32_t ne1) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv, + kp::shader_data::op_mul_mat_f16_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00; + uint32_t nb01, nb02; + uint32_t nb11, nb12; + int32_t ne02, ne12; + int32_t ne0, ne1; + } pushConsts { + safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, nb01, nb02, nb11, nb12, ne02, ne12, ne0, ne1, + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))}, {local_x}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +static void ggml_vk_mul_mat_q8_0(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + int32_t ne0, int32_t ne1) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv, + kp::shader_data::op_mul_mat_q8_0_comp_spv_len); + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + 
int32_t ne00; + uint32_t nb01, nb02; + uint32_t nb11, nb12; + int32_t ne0, ne1; + } pushConsts { + inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, nb01, nb02, nb11, nb12, ne0, ne1, + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + const uint32_t local_x = ggml_vk_current_device().subgroupSize; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + + +static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + uint32_t nb1, uint32_t nb2) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv, + kp::shader_data::op_mul_mat_mat_f32_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne01, ne02, ne11, ne12; + uint32_t nb01, nb02; + uint32_t nb11, nb12; + uint32_t nb1, nb2; + } pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne01, ne02, ne11, ne12, + nb01, nb02, nb11, nb12, + nb1, nb2 + }; + + const uint32_t local_x = ggml_vk_current_device().subgroupSize; + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), + {inA, inB, out}, spirv, + {unsigned(ne01), + unsigned(ne11), + 
unsigned(std::max(ne12, ne02)) + }, + {local_x}, + {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), + unsigned(ne11), + unsigned(std::max(ne12, ne02)), + }); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +// Shared recorder for the q4_0 / q4_1 matrix-multiply shaders. +// `block_size` divides the byte offset of inA; inB/out offsets are divided by 4. +// `suffix` (optional) is appended to the cached algorithm name so that each shader passed in +// gets its own cache entry; the reuse path below does not replace the SPIR-V. +static void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_size, kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1, + int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02, + const char * suffix = nullptr) { + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne10, ne0, ne1, ne01, gqa; + } pushConsts { + safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne10, ne0, ne1, ne01, ne12/ne02 + }; + + // Cache key: function name plus the per-shader suffix, when one is given. + std::string name(__func__); + if (suffix) { + name += "_"; + name += suffix; + } + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(name)) { + const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(name); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +// Q4_0 mat-mul: forwards to ggml_vk_mul_mat_q4_x with its own shader and cache suffix. +template +static void ggml_vk_mul_mat_q4_0(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv, + kp::shader_data::op_mul_mat_q4_0_comp_spv_len); + + ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward(args)..., "q4_0"); +} + +// Q4_1 mat-mul: forwards to ggml_vk_mul_mat_q4_x with its own shader and cache suffix. +template +static void ggml_vk_mul_mat_q4_1(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv, + kp::shader_data::op_mul_mat_q4_1_comp_spv_len); + + ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward(args)..., "q4_1"); +} + +static void ggml_vk_mul_mat_q6_k(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1, + int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv, + kp::shader_data::op_mul_mat_q6_k_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne10, ne0, ne1, ne01, gqa; + } pushConsts { + inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne10, ne0, ne1, ne01, ne12/ne02 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +// Shared recorder for the get_rows shaders; dispatches `size` workgroups. +// `element_size` divides the byte offset of inA; when `qk` is non-zero, ne00 must be a multiple of it. +// `suffix` (optional) is appended to the cached algorithm name so that each shader passed in +// gets its own cache entry; the reuse path below does not replace the SPIR-V. +static void ggml_vk_get_rows(const std::vector& spirv, + unsigned element_size, unsigned qk, + kp::Sequence& seq, + const 
std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t nb01, int32_t nb1, + uint32_t size, + const char * suffix = nullptr) { + GGML_ASSERT(nb01%element_size == 0); + GGML_ASSERT(nb1%sizeof(float) == 0); + if (qk) GGML_ASSERT(ne00%qk == 0); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, nb01, nb1; + } pushConsts { + safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, nb01, nb1 + }; + + // Cache key: function name plus the per-shader suffix, when one is given. + std::string name(__func__); + if (suffix) { + name += "_"; + name += suffix; + } + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(name)) + s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo = komputeManager()->getAlgorithm(name); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +// F16 get_rows: forwards to ggml_vk_get_rows with its own shader and cache suffix. +template +static void ggml_vk_get_rows_f16(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv, + kp::shader_data::op_getrows_f16_comp_spv_len); + + ggml_vk_get_rows(spirv, sizeof(half), 0, std::forward(args)..., "f16"); +} + +// Q4_0 get_rows: forwards to ggml_vk_get_rows with its own shader and cache suffix. +template +static void ggml_vk_get_rows_q4_0(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv, + kp::shader_data::op_getrows_q4_0_comp_spv_len); + + ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_0, std::forward(args)..., "q4_0"); +} + +// Q4_1 get_rows: forwards to ggml_vk_get_rows with its own shader and cache suffix. +template +static void ggml_vk_get_rows_q4_1(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv, + kp::shader_data::op_getrows_q4_1_comp_spv_len); + + ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward(args)..., "q4_1"); +} + +// Q6_K get_rows: forwards to ggml_vk_get_rows with its own shader and cache suffix. +template +static void ggml_vk_get_rows_q6_k(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv, + kp::shader_data::op_getrows_q6_k_comp_spv_len); + ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward(args)..., "q6_k"); +} + +static void ggml_vk_rope( + kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_orig_ctx, + float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow, + int32_t ne01, int32_t ne02, int32_t ne03, + uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, + int32_t ne0, + uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3 +) { + GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32); + + static const auto spirv_f16 = getSpirvShader( + kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len + ); + static const auto spirv_f32 = getSpirvShader( + kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len + ); + + int type_size = src0t == GGML_TYPE_F16 ? 
2 : 4; + + GGML_ASSERT(nb03 % type_size == 0); + GGML_ASSERT(nb02 % type_size == 0); + GGML_ASSERT(nb01 % type_size == 0); + GGML_ASSERT(nb00 % type_size == 0); + GGML_ASSERT(nb3 % type_size == 0); + GGML_ASSERT(nb2 % type_size == 0); + GGML_ASSERT(nb1 % type_size == 0); + GGML_ASSERT(nb0 % type_size == 0); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t n_dims, mode, n_orig_ctx; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + uint32_t nb00, nb01, nb02, nb03; + int32_t ne0; + uint32_t nb0, nb1, nb2, nb3; + } pushConsts { + safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size), + n_dims, mode, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, + nb00, nb01, nb02, nb03, + ne0, + nb0, nb1, nb2, nb3 + }; + + auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32"); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(name)) { + s_algo = komputeManager()->algorithm( + name, s_kompute_context->pool.get(), {inA, inB, out}, + src0t == GGML_TYPE_F16 ? 
spirv_f16 : spirv_f32, + {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts} + ); + } else { + s_algo = komputeManager()->getAlgorithm(name); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +template +static void ggml_vk_cpy(const std::vector& spirv, + kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03, + uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, + int32_t ne0, int32_t ne1, int32_t ne2, + uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) { + struct PushConstants { + uint32_t inOff, outOff; + int32_t ne00, ne01, ne02; + uint32_t nb00, nb01, nb02, nb03; + int32_t ne0, ne1, ne2; + uint32_t nb0, nb1, nb2, nb3; + } pushConsts { + safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size), + ne00, ne01, ne02, + nb00, nb01, nb02, nb03, + ne0, ne1, ne2, + nb0, nb1, nb2, nb3 + }; + + static std::string unique_name = std::string(__func__) + + "_i_" + std::to_string(in_element_size) + + "_o_" + std::to_string(out_element_size); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(unique_name)) + s_algo = komputeManager()->algorithm(unique_name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + else { + s_algo = komputeManager()->getAlgorithm(unique_name); + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + +template +static void ggml_vk_cpy_f32_f16(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv, + kp::shader_data::op_cpy_f32_f16_comp_spv_len); + ggml_vk_cpy<4, 2>(spirv, std::forward(args)...); +} + +template +static void ggml_vk_cpy_f32_f32(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv, + kp::shader_data::op_cpy_f32_f32_comp_spv_len); + ggml_vk_cpy<4, 4>(spirv, std::forward(args)...); +} + +template +static void ggml_vk_cpy_f16_f16(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv, + kp::shader_data::op_cpy_f16_f16_comp_spv_len); + ggml_vk_cpy<2, 2>(spirv, std::forward(args)...); +} + +template +static void ggml_vk_cpy_f16_f32(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv, + kp::shader_data::op_cpy_f16_f32_comp_spv_len); + ggml_vk_cpy<2, 4>(spirv, std::forward(args)...); +} + +static bool ggml_kompute_supports_op(const struct ggml_tensor * op) { + switch (op->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + break; + default: + return false; + } + + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + return true; + default: + ; + } + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_SCALE: + case GGML_OP_SOFT_MAX: + case GGML_OP_RMS_NORM: + case GGML_OP_NORM: + case GGML_OP_ROPE: + return true; + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + break; + default: + return false; + } + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + break; + default: + return false; + } + return true; + case GGML_OP_DIAG_MASK_INF: + return 
op->ne[3] == 1; + case GGML_OP_GET_ROWS: + switch (op->src[0]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q6_K: + return op->ne[3] == 1; + default: + ; + } + return false; + case GGML_OP_MUL_MAT: + if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1])) + return false; + + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q6_K: + return true; + default: + ; + } + default: + ; + } + return false; +} + +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { + const int n_seq = 8; + + // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting + // it to the size of the graph, but I think it can be made smaller? + ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes); + + std::vector> sequences(n_seq); + + for (auto& sequence : sequences) { + sequence = komputeManager()->sequence(); + } + for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) { + const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; + + auto& seq = *sequences[seq_idx]; + + const int node_start = (seq_idx + 0) * n_nodes_per_seq; + const int node_end = std::min((seq_idx == n_seq - 1) ? 
gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes); + + bool any_commands_recorded = false; + + for (int i = node_start; i < node_end; ++i) { + struct ggml_tensor * src0 = gf->nodes[i]->src[0]; + struct ggml_tensor * src1 = gf->nodes[i]->src[1]; + struct ggml_tensor * dst = gf->nodes[i]; + GGML_ASSERT(dst->data != nullptr); + + switch (dst->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + continue; // noop -> next node + default: + break; + } + + any_commands_recorded = true; + + if (!ggml_kompute_supports_op(dst)) { + fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); + GGML_ASSERT(!"unsupported op"); + } + + const int32_t ne00 = src0 ? src0->ne[0] : 0; + const int32_t ne01 = src0 ? src0->ne[1] : 0; + const int32_t ne02 = src0 ? src0->ne[2] : 0; + const int32_t ne03 = src0 ? src0->ne[3] : 0; + + const uint32_t nb00 = src0 ? src0->nb[0] : 0; + const uint32_t nb01 = src0 ? src0->nb[1] : 0; + const uint32_t nb02 = src0 ? src0->nb[2] : 0; + const uint32_t nb03 = src0 ? src0->nb[3] : 0; + + const int32_t ne10 = src1 ? src1->ne[0] : 0; + const int32_t ne11 = src1 ? src1->ne[1] : 0; + const int32_t ne12 = src1 ? src1->ne[2] : 0; + const int32_t ne13 = src1 ? src1->ne[3] : 0; + + const uint32_t nb10 = src1 ? src1->nb[0] : 0; + const uint32_t nb11 = src1 ? src1->nb[1] : 0; + const uint32_t nb12 = src1 ? src1->nb[2] : 0; + const uint32_t nb13 = src1 ? src1->nb[3] : 0; + + const int32_t ne0 = dst ? dst->ne[0] : 0; + const int32_t ne1 = dst ? dst->ne[1] : 0; + const int32_t ne2 = dst ? dst->ne[2] : 0; +// const int32_t ne3 = dst ? dst->ne[3] : 0; + + const uint32_t nb0 = dst ? dst->nb[0] : 0; + const uint32_t nb1 = dst ? dst->nb[1] : 0; + const uint32_t nb2 = dst ? dst->nb[2] : 0; + const uint32_t nb3 = dst ? dst->nb[3] : 0; + + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? 
src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + + const static std::shared_ptr nullTensor = nullptr; + uint32_t off_src0 = 0; + uint32_t off_src1 = 0; + uint32_t off_dst = 0; + const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor; + const std::shared_ptr& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor; + const std::shared_ptr& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor; + + switch (dst->op) { + case GGML_OP_ADD: + { + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { + // src1 is a row + ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00); + } else { + ggml_vk_add( + seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, + ne00, ne01, ne02, ne03, + nb00, nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb10, nb11, nb12, nb13, + ne0, + nb0, nb1, nb2, nb3 + ); + } + } break; + case GGML_OP_MUL: + { + if (ggml_nelements(src1) == ne10) { + // src1 is a row + ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00); + } else { + ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4); + } + } break; + case GGML_OP_SCALE: + { + float scale; memcpy(&scale, dst->op_params, sizeof(float)); + + ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale); + } break; + case GGML_OP_UNARY: + { + int64_t n = ggml_nelements(dst); + GGML_ASSERT(n % 4 == 0); + switch (ggml_get_unary_op(gf->nodes[i])) { + case GGML_UNARY_OP_SILU: + { + ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4); + } break; + case GGML_UNARY_OP_RELU: + { + ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4); + } break; + case GGML_UNARY_OP_GELU: + { + GGML_ASSERT(n % 8 == 0); + ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8); + } break; + default: + { + 
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } + } + } break; + case GGML_OP_SOFT_MAX: + { + ggml_vk_soft_max(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03); + } break; + case GGML_OP_DIAG_MASK_INF: + { + const int n_past = ((int32_t *)(dst->op_params))[0]; + ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02); + } break; + case GGML_OP_NORM: + { + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps); + } break; + case GGML_OP_RMS_NORM: + { + GGML_ASSERT(ne00 % 4 == 0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps); + } break; + case GGML_OP_MUL_MAT: + { + if (src1t != GGML_TYPE_F32) { + fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); + goto not_implemented; + } + + if (ggml_is_transposed(src0) || + ggml_is_transposed(src1)) { + fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); + goto not_implemented; + } + + switch (src0t) { + case GGML_TYPE_F32: + ggml_vk_mul_mat_mat_f32(seq, + id_src0, id_src1, id_dst, + off_src0, off_src1, off_dst, + ne00, ne01, ne02, + nb01, nb02, + ne11, ne12, + nb11, nb12, + nb1, nb2); + break; + case GGML_TYPE_F16: + ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); + break; + case GGML_TYPE_Q8_0: + ggml_vk_mul_mat_q8_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); + break; + case GGML_TYPE_Q4_0: + ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, 
ne12, ne02); + break; + case GGML_TYPE_Q4_1: + ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); + break; + case GGML_TYPE_Q6_K: + ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); + break; + default: { + fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); + goto not_implemented; + } + } + + } break; + case GGML_OP_GET_ROWS: + { + if (src0t == GGML_TYPE_F16) { + ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); + } else if (src0t == GGML_TYPE_Q4_0) { + ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); + } else if (src0t == GGML_TYPE_Q4_1) { + ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); + } else if (src0t == GGML_TYPE_Q6_K) { + ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); + } else { + fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t); + goto not_implemented; + } + } break; + case GGML_OP_ROPE: + { + GGML_ASSERT(ne10 == ne02); + GGML_ASSERT(src0t == dstt); + // const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + 
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + ggml_vk_rope( + seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, + ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 + ); + } break; + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + { + switch (src0t) { + case GGML_TYPE_F32: + { + switch (dstt) { + case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; + case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; + default: goto not_implemented; + } + } break; + case GGML_TYPE_F16: + { + switch (dstt) { + case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; + case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; + default: goto not_implemented; + } break; + default: goto not_implemented; + } + } + } break; + default: goto not_implemented; + } + continue; + not_implemented: {} + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + //GGML_ASSERT(false); + } + + // Evaluate sequence + if (any_commands_recorded) { + seq.evalAsync(); + } + } + + // Wait for all sequences to finish + for (auto& sequence : sequences) { + if (sequence->isRunning()) + sequence->evalAwait(); + } + + ggml_vk_free_descriptor_pool(ctx); +} + +template<> +kp::Tensor::TensorDataTypes 
+kp::TensorT::dataType() +{ + return TensorDataTypes::eFloat; +} + +template<> +kp::Tensor::TensorDataTypes +kp::TensorT::dataType() +{ + return TensorDataTypes::eUnsignedInt; +} + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "Kompute"; +} + +static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) { + auto * memory = (ggml_vk_memory *)buffer->context; + if (ggml_vk_has_device()) { + ggml_vk_free_memory(*memory); + } + delete memory; +} + +static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { + return ((ggml_vk_memory *)buffer->context)->data; +} + +static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + const auto res = ggml_vk_get_tensor(tensor); + GGML_ASSERT(res); + + memcpy((char *)tensor->data + offset, data, size); + + komputeManager()->sequence()->eval({res}); +} + +static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + const auto res = ggml_vk_get_tensor(tensor); + GGML_ASSERT(res); + + komputeManager()->sequence()->eval({res}); + + memcpy(data, (const char *)tensor->data + offset, size); +} + +static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + auto * memory = (ggml_vk_memory *)buffer->context; + memset(memory->data, value, buffer->size); + + if (memory->stagingBuffer) + komputeManager()->sequence()->eval(memory->primaryBuffer, memory->stagingBuffer, memory->size); +} + +static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = { + /* .get_name = */ ggml_backend_kompute_buffer_get_name, + /* .free_buffer = */ 
ggml_backend_kompute_buffer_free_buffer, + /* .get_base = */ ggml_backend_kompute_buffer_get_base, + /* .init_tensor = */ NULL, + /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_kompute_buffer_get_tensor, + /* .cpy_tensor_from = */ NULL, + /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_kompute_buffer_clear, + /* .reset = */ NULL, +}; + +// default buffer type + +static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return "Kompute"; +} + +static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size)); + return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size); +} + +static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + static size_t minStorageBufferOffsetAlignment = 0; + if (minStorageBufferOffsetAlignment == 0) { + GGML_ASSERT(ggml_vk_has_device()); + vk::PhysicalDeviceProperties deviceProperties; + deviceProperties = komputeManager()->physicalDevice()->getProperties(); + vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits; + minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment; + } + + return minStorageBufferOffsetAlignment; +} + +static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + GGML_UNUSED(buft); + return ggml_backend_is_kompute(backend); +} + +ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = { + /* .iface = */ { + /* .get_name = */ ggml_backend_kompute_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment, + /* .get_alloc_size 
= */ NULL, // defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend, + /* .is_host = */ NULL, + }, + /* .context = */ NULL, + }; + + return &ggml_backend_buffer_type_kompute; +} + +// backend + +static const char * ggml_backend_kompute_name(ggml_backend_t backend) { + GGML_UNUSED(backend); + return "Kompute"; +} + +static void ggml_backend_kompute_free(ggml_backend_t backend) { + struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context; + ggml_vk_free(ctx); + ggml_vk_free_device(); + delete backend; +} + +static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) { + GGML_UNUSED(backend); + return ggml_backend_kompute_buffer_type(); +} + +static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + auto * ctx = (ggml_kompute_context *)backend->context; + ggml_vk_graph_compute(ctx, cgraph); + return true; +} + +static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + GGML_UNUSED(backend); + return ggml_kompute_supports_op(op); +} + +static struct ggml_backend_i kompute_backend_i = { + /* .get_name = */ ggml_backend_kompute_name, + /* .free = */ ggml_backend_kompute_free, + /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_kompute_graph_compute, + /* .supports_op = */ ggml_backend_kompute_supports_op, +}; + +ggml_backend_t ggml_backend_kompute_init() { + if (!ggml_vk_has_device()) { + fprintf(stderr, "%s: error: device was not initialized\n", __func__); + return nullptr; + } + + struct ggml_kompute_context * ctx = 
ggml_vk_init(); + + ggml_backend_t kompute_backend = new ggml_backend { + /* .interface = */ kompute_backend_i, + /* .context = */ ctx, + }; + + return kompute_backend; +} + +bool ggml_backend_is_kompute(ggml_backend_t backend) { + return backend && backend->iface.get_name == ggml_backend_kompute_name; +} + +extern "C" ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data); + +ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) { + GGML_UNUSED(params); + GGML_UNUSED(user_data); + ggml_vk_init_device(0, "gpu"); + return ggml_backend_kompute_init(); +} diff --git a/ggml-kompute.h b/ggml-kompute.h new file mode 100644 index 000000000..288c835c5 --- /dev/null +++ b/ggml-kompute.h @@ -0,0 +1,69 @@ +#pragma once + +#include "ggml-backend.h" + +#include +#include +#include + +struct ggml_kompute_context; + +namespace vk { + class DeviceMemory; + class Buffer; +}; + +struct ggml_vk_memory { + void *data = nullptr; + size_t size = 0; + vk::DeviceMemory *primaryMemory = nullptr; + vk::Buffer *primaryBuffer = nullptr; + vk::DeviceMemory *stagingMemory = nullptr; + vk::Buffer *stagingBuffer = nullptr; +}; + +struct ggml_vk_device { + int index = 0; + int type = 0; // same as VkPhysicalDeviceType + size_t heapSize = 0; + std::string name; + std::string vendor; + int subgroupSize = 0; +}; + +std::vector ggml_vk_available_devices(size_t memoryRequired); +bool ggml_vk_init_device(size_t memoryRequired, const std::string &device); +bool ggml_vk_init_device(const ggml_vk_device &device); +bool ggml_vk_init_device(int device); +bool ggml_vk_free_device(); +bool ggml_vk_has_vulkan(); +bool ggml_vk_has_device(); +bool ggml_vk_using_vulkan(); +ggml_vk_device ggml_vk_current_device(); +struct ggml_kompute_context * ggml_vk_init(void); +void ggml_vk_free(struct ggml_kompute_context * ctx); +void ggml_vk_free_memory(ggml_vk_memory &memory); + +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf); 
+ +// +// backend API +// user-code should use only these functions +// + +#ifdef __cplusplus +extern "C" { +#endif + +// forward declaration +typedef struct ggml_backend * ggml_backend_t; + +GGML_API ggml_backend_t ggml_backend_kompute_init(void); + +GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend); + +GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml-metal.m b/ggml-metal.m index 6c2a8d04e..547b618b4 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2482,10 +2482,10 @@ static void ggml_backend_metal_free_device(void) { } } -static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { - struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; +static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { + return "Metal"; - return ctx->all_data; + UNUSED(buffer); } static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -2503,6 +2503,12 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) free(ctx); } +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + return ctx->all_data; +} + static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); @@ -2515,13 +2521,13 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c UNUSED(buffer); } -static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) 
{ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); UNUSED(buffer); } -static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); UNUSED(buffer); @@ -2534,6 +2540,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ } static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { + /* .get_name = */ ggml_backend_metal_buffer_get_name, /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, @@ -2542,10 +2549,17 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, /* .clear = */ ggml_backend_metal_buffer_clear, + /* .reset = */ NULL, }; // default buffer type +static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "Metal"; + + UNUSED(buft); +} + static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); @@ -2618,6 +2632,7 @@ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t bu ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to 
ggml_nbytes @@ -2641,6 +2656,14 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz ctx->n_buffers = 0; const size_t size_page = sysconf(_SC_PAGESIZE); + + // page-align the data ptr + { + const uintptr_t offs = (uintptr_t) data % size_page; + data = (void *) ((char *) data - offs); + size += offs; + } + size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); @@ -2741,7 +2764,7 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct UNUSED(backend); } -static struct ggml_backend_i metal_backend_i = { +static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type, @@ -2767,7 +2790,7 @@ ggml_backend_t ggml_backend_metal_init(void) { ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); *metal_backend = (struct ggml_backend) { - /* .interface = */ metal_backend_i, + /* .interface = */ ggml_backend_metal_i, /* .context = */ ctx, }; @@ -2775,7 +2798,7 @@ ggml_backend_t ggml_backend_metal_init(void) { } bool ggml_backend_is_metal(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_metal_name; + return backend && backend->iface.get_name == ggml_backend_metal_name; } void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 496f9cdca..cfa766eb1 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "ggml-opencl.h" +#include "ggml-backend-impl.h" #include #include @@ -10,7 +11,7 @@ #include #include -#define CL_TARGET_OPENCL_VERSION 110 +#define CL_TARGET_OPENCL_VERSION 120 #include #if defined(_MSC_VER) @@ -929,6 +930,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co } void ggml_cl_init(void) { + static bool initialized = 
false; + if (initialized) { + return; + } + cl_int err; struct cl_device; @@ -1483,8 +1489,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } else { d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; @@ -1501,7 +1507,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + if (src1->backend == GGML_BACKEND_CPU) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + } CL_CHECK(clFinish(queue)); @@ -1522,8 +1530,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + if (dst->backend == GGML_BACKEND_CPU) { + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } } } } @@ -1532,8 +1542,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr if (src0->backend != GGML_BACKEND_GPU) { ggml_cl_pool_free(d_X, x_size); } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); + if (src1->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_Y, y_size); + } + if (dst->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_D, d_size); + } } static void 
ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { @@ -1598,6 +1612,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); } + // FIXME: convert on device + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // convert src1 to fp16 // TODO: use multiple threads @@ -1643,11 +1659,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host, then convert to float - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - ggml_fp16_to_fp32_row(tmp, d, d_ne); + if (dst->backend == GGML_BACKEND_CPU) { + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + ggml_fp16_to_fp32_row(tmp, d, d_ne); + } else { + // FIXME: convert dst to fp32 on device + } } } } @@ -1801,7 +1819,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) { const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; @@ -1895,3 +1913,292 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { tensor->extra = dst; GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); } + +// ggml-backend + +// buffer + +struct ggml_backend_opencl_buffer_context { + ~ggml_backend_opencl_buffer_context() { + if (buffer) { + clReleaseMemObject(buffer); + } + for (auto * sub_buffer : sub_buffers) { + clReleaseMemObject(sub_buffer); + } + } + + cl_mem buffer; + std::vector 
sub_buffers; +}; + +static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; + +static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) { + return "OpenCL"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + delete ctx; +} + +static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { + return cl_ptr_base; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (tensor->view_src != NULL && tensor->view_offs == 0) { + tensor->extra = tensor->view_src->extra; + } else { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)}; + cl_int err; + cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + ctx->sub_buffers.push_back(sub_buffer); + tensor->extra = sub_buffer; + } + tensor->backend = GGML_BACKEND_GPU; +} + +static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void 
ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); +} + +static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + clReleaseMemObject(sub_buffer); + } + ctx->sub_buffers.clear(); +} + +static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_get_name, + /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer, + /* .get_base = */ ggml_backend_opencl_buffer_get_base, + /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor, + /* .cpy_tensor_from = */ NULL, + /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_opencl_buffer_clear, + /* .reset = */ ggml_backend_opencl_buffer_reset, +}; + +// buffer type + +static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) { + return "OpenCL"; + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) { + ggml_cl_init(); + + cl_int err; + cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err); + if (err != CL_SUCCESS) { + fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0); + return nullptr; + } + + ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}}; + + return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size); +} + +static size_t 
ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { + // FIXME: not thread safe, device may not be initialized yet + static cl_uint alignment = -1; + if (alignment == (cl_uint)-1) { + ggml_cl_init(); + clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL); + } + return alignment; + + GGML_UNUSED(buffer_type); +} + +static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) { + //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend + return ggml_backend_is_cpu(backend); + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, + /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend, + /* .is_host = */ NULL, +}; + + +ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { + static ggml_backend_buffer_type buffer_type = { + /* .iface = */ ggml_backend_opencl_buffer_type_interface, + /* .context = */ nullptr, + }; + + return &buffer_type; +} + +#if 0 +// host buffer type + +static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "CL_Host"; + + GGML_UNUSED(buft); +} + +static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) { + return "CL_Host"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_cl_host_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_cl_host_malloc(size); + + if (ptr == nullptr) { + // fallback 
to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_opencl_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_opencl_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_opencl_buffer_type_host; +} + +// backend + +static const char * ggml_backend_opencl_name(ggml_backend_t backend) { + return "OpenCL"; + + GGML_UNUSED(backend); +} + +static void ggml_backend_opencl_free(ggml_backend_t backend) { + GGML_UNUSED(backend); +} + +static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_opencl_buffer_type(); + + GGML_UNUSED(backend); +} + +static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { + for (int i = 0; i < graph->n_nodes; ++i) { + ggml_tensor * node = graph->nodes[i]; + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0); + break; + case GGML_OP_MUL: + ggml_cl_mul(node->src[0], node->src[1], node); + break; + default: + GGML_ASSERT(false); + } + } + + return true; + + GGML_UNUSED(backend); +} + +static bool 
ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_can_mul_mat(op->src[0], op->src[1], op); + case GGML_OP_MUL: + // return ggml_can_repeat_rows(op->src[1], op->src[0]); + return true; + default: + return false; + } + + GGML_UNUSED(backend); +} + +static ggml_backend_i opencl_backend_i = { + /* .get_name = */ ggml_backend_opencl_name, + /* .free = */ ggml_backend_opencl_free, + /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_opencl_graph_compute, + /* .supports_op = */ ggml_backend_opencl_supports_op, +}; + +ggml_backend_t ggml_backend_opencl_init() { + ggml_backend_t backend = new ggml_backend { + /* .interface = */ opencl_backend_i, + /* .context = */ nullptr + }; + + return backend; +} + +bool ggml_backend_is_opencl(ggml_backend_t backend) { + return backend && backend->iface.get_name == ggml_backend_opencl_name; +} +#endif diff --git a/ggml-opencl.h b/ggml-opencl.h index 44d05bd64..919b00d63 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #ifdef __cplusplus extern "C" { @@ -9,17 +10,26 @@ extern "C" { GGML_API void ggml_cl_init(void); GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); GGML_API size_t ggml_cl_mul_mat_get_wsize(const 
struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -GGML_API void * ggml_cl_host_malloc(size_t size); -GGML_API void ggml_cl_host_free(void * ptr); +// GGML_API void * ggml_cl_host_malloc(size_t size); +// GGML_API void ggml_cl_host_free(void * ptr); GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor); GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); +// backend API + +// GGML_API ggml_backend_t ggml_backend_opencl_init(void); + +// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend); + +GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); +// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void); + #ifdef __cplusplus } #endif diff --git a/ggml.c b/ggml.c index adb387100..1027fabdb 100644 --- a/ggml.c +++ b/ggml.c @@ -2336,6 +2336,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } void ggml_free(struct ggml_context * ctx) { + if (ctx == NULL) { + return; + } + // make this function thread safe ggml_critical_section_start(); @@ -4351,6 +4355,23 @@ struct ggml_tensor * ggml_cpy_inplace( return ggml_cpy_impl(ctx, a, b, true); } +struct ggml_tensor * ggml_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_type type) { + bool is_node = false; + + struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); + ggml_format_name(result, "%s (copy)", a->name); + + result->op = GGML_OP_CPY; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = result; + + return result; +} + // ggml_cont static struct ggml_tensor * ggml_cont_impl( @@ -14851,7 +14872,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso return i; } -static struct ggml_hash_set ggml_hash_set_new(size_t size) { +struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; result.size = size; @@ -16600,7 +16621,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return GGML_EXIT_SUCCESS; } -struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { +struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } @@ -16662,14 +16683,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_MUL_MAT_ID: { + cur = 0; const struct ggml_tensor * src0 = node->src[2]; const struct ggml_tensor * src1 = node->src[1]; const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; if (src1->type != vec_dot_type) { - cur = ggml_row_size(vec_dot_type, ggml_nelements(src1)); + cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); } const int n_as = ggml_get_op_params_i32(node, 1); - cur = GGML_PAD(cur, sizeof(int64_t)); // align + cur = GGML_PAD(cur, sizeof(int64_t)); // align: round cur up in place (adding the padded value would double-count it) cur += n_as * sizeof(int64_t); // matrix_row_counts cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows } break; diff --git a/ggml.h b/ggml.h index c55e598b4..2013a73d1 100644 --- a/ggml.h +++ b/ggml.h @@ -1167,6 +1167,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_type type); + // make contiguous GGML_API struct ggml_tensor * ggml_cont( struct ggml_context * ctx, @@ -1849,8 +1854,8 @@ extern "C" { //
ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data diff --git a/kompute b/kompute new file mode 160000 index 000000000..4565194ed --- /dev/null +++ b/kompute @@ -0,0 +1 @@ +Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306 diff --git a/kompute-shaders/common.comp b/kompute-shaders/common.comp new file mode 100644 index 000000000..0df6db7d0 --- /dev/null +++ b/kompute-shaders/common.comp @@ -0,0 +1,97 @@ +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable + +#define QK4_0 32 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 +#define TWOPI_F 6.283185307179586f + +#define QK_K 256 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx 
+ 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +mat4 dequantize_q4_0(const block_q4_0 xb, uint il) { + const float d1 = il != 0 ? (xb.d / 16.f) : xb.d; + const float d2 = d1 / 256.f; + const float md = -8.f * xb.d; + const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F); + const uint16_t mask1 = mask0 << 8; + + mat4 reg; + for (int i=0;i<8;i++) { + uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]); + reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md; + reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md; + } + return reg; +} + +#define sizeof_block_q4_1 0x14 +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; +mat4 dequantize_q4_1(const block_q4_1 xb, uint il) { + const float d1 = il != 0 ? (xb.d / 16.f) : xb.d; + const float d2 = d1 / 256.f; + const float m = xb.m; + const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F); + const uint16_t mask1 = mask0 << 8; + + mat4 reg; + for (int i=0;i<8;i++) { + uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]); + reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m; + reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m; + } + return reg; +} + +#define sizeof_block_q6_k 210 +struct block_q6_k { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// Dequantize one 16-value slice of a q6_k super-block into a mat4; il selects the slice. +// The low-nibble (ql) and high-bit (qh) quants are read straight from xb: shadowing them +// with local arrays that are never filled would feed undefined values into the result. +mat4 dequantize_q6_k(const block_q6_k xb, uint il) { + const float16_t d_all = xb.d; + + const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1); + const uint qhIndex = 32*(il/8) + 16*(il&1); + float16_t sc = xb.scales[(il%2) + 2 * ((il/2))]; + il = (il/2) & 3; + + const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 
12 : 3); + const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F); + const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f); + const float16_t ml = float16_t(d_all * sc * 32.f); + const float16_t dl = float16_t(d_all * sc * coef); + mat4 reg; + for (int i = 0; i < 16; ++i) { + const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2)) + : ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4)); + reg[i/4][i%4] = dl * q - ml; + } + return reg; +} diff --git a/kompute-shaders/op_add.comp b/kompute-shaders/op_add.comp new file mode 100644 index 000000000..b7b76a79d --- /dev/null +++ b/kompute-shaders/op_add.comp @@ -0,0 +1,58 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1024) in; + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int nb00; + int nb01; + int nb02; + int nb03; + int ne10; + int ne11; + int ne12; + int ne13; + int nb10; + int nb11; + int nb12; + int nb13; + int ne0; + int nb0; + int nb1; + int nb2; + int nb3; + //int offs; // TODO: needed for GGML_OP_ACC, see metal code +} pcs; + +// general-purpose kernel for addition of two tensors +// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 +// cons: not very efficient +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const uint i13 = i03 % pcs.ne13; + const uint i12 = i02 % pcs.ne12; + const uint i11 = i01 % pcs.ne11; + + int offs = 0; // TMP (see above) + + uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4); + uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4); + uint dst_off =
uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4); + + for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { + const uint i10 = i0 % pcs.ne10; + out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10]; + } +} diff --git a/kompute-shaders/op_addrow.comp b/kompute-shaders/op_addrow.comp new file mode 100644 index 000000000..2376a6b8f --- /dev/null +++ b/kompute-shaders/op_addrow.comp @@ -0,0 +1,25 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; + uint row; +} pcs; + +void main() { + const uint baseIndex = gl_WorkGroupID.x * 4; + + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; + } +} diff --git a/kompute-shaders/op_cpy_f16_f16.comp b/kompute-shaders/op_cpy_f16_f16.comp new file mode 100644 index 000000000..d57247d2d --- /dev/null +++ b/kompute-shaders/op_cpy_f16_f16.comp @@ -0,0 +1,52 @@ +#version 450 + +#include "common.comp" + +#define IN_TYPE float16_t +#define IN_TYPE_SIZE 2 +#define OUT_TYPE float16_t +#define OUT_TYPE_SIZE 2 + +layout(local_size_x = 1024) in; + +layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; +layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; + +layout (push_constant) uniform parameter { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + int ne1; + int ne2; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint 
i01 = gl_WorkGroupID.x; + + const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; + + const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); + const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); + const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; + const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); + + const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { + const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ + out_[dst_data+i00] = OUT_TYPE(in_[src]); + } +} diff --git a/kompute-shaders/op_cpy_f16_f32.comp b/kompute-shaders/op_cpy_f16_f32.comp new file mode 100644 index 000000000..b568bcd7b --- /dev/null +++ b/kompute-shaders/op_cpy_f16_f32.comp @@ -0,0 +1,52 @@ +#version 450 + +#include "common.comp" + +#define IN_TYPE float16_t +#define IN_TYPE_SIZE 2 +#define OUT_TYPE float +#define OUT_TYPE_SIZE 4 + +layout(local_size_x = 1024) in; + +layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; +layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; + +layout (push_constant) uniform parameter { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + int ne1; + int ne2; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; + + const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); + const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); + const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - 
i2*pcs.ne1*pcs.ne0) / pcs.ne0; + const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); + + const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { + const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ + out_[dst_data+i00] = OUT_TYPE(in_[src]); + } +} diff --git a/kompute-shaders/op_cpy_f32_f16.comp b/kompute-shaders/op_cpy_f32_f16.comp new file mode 100644 index 000000000..99b228343 --- /dev/null +++ b/kompute-shaders/op_cpy_f32_f16.comp @@ -0,0 +1,52 @@ +#version 450 + +#include "common.comp" + +#define IN_TYPE float +#define IN_TYPE_SIZE 4 +#define OUT_TYPE float16_t +#define OUT_TYPE_SIZE 2 + +layout(local_size_x = 1024) in; + +layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; +layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; + +layout (push_constant) uniform parameter { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + int ne1; + int ne2; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; + + const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); + const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); + const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; + const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); + + const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; 
i00 += gl_WorkGroupSize.x) { + const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ + out_[dst_data+i00] = OUT_TYPE(in_[src]); + } +} diff --git a/kompute-shaders/op_cpy_f32_f32.comp b/kompute-shaders/op_cpy_f32_f32.comp new file mode 100644 index 000000000..2fc998492 --- /dev/null +++ b/kompute-shaders/op_cpy_f32_f32.comp @@ -0,0 +1,52 @@ +#version 450 + +#include "common.comp" + +#define IN_TYPE float +#define IN_TYPE_SIZE 4 +#define OUT_TYPE float +#define OUT_TYPE_SIZE 4 + +layout(local_size_x = 1024) in; + +layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; +layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; + +layout (push_constant) uniform parameter { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + int ne1; + int ne2; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; + + const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); + const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); + const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; + const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); + + const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { + const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ + out_[dst_data+i00] = OUT_TYPE(in_[src]); + } +} diff --git a/kompute-shaders/op_diagmask.comp b/kompute-shaders/op_diagmask.comp new file mode 
100644 index 000000000..291c3fc18 --- /dev/null +++ b/kompute-shaders/op_diagmask.comp @@ -0,0 +1,30 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + uint n_past; + int ne00; + int ne01; +} pcs; + +void main() { + const uint i02 = gl_WorkGroupID.z; + const uint i01 = gl_WorkGroupID.y; + const uint i00 = gl_WorkGroupID.x; + + const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00; + + if (i00 > pcs.n_past + i01) { + out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000); + } else { + out_[index + pcs.outOff] = in_[index + pcs.inOff]; + } +} diff --git a/kompute-shaders/op_gelu.comp b/kompute-shaders/op_gelu.comp new file mode 100644 index 000000000..5b547f414 --- /dev/null +++ b/kompute-shaders/op_gelu.comp @@ -0,0 +1,22 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; +} pcs; + +void main() { + const uint baseIndex = gl_WorkGroupID.x * 8; + + for (uint x = 0; x < 8; x++) { + const uint i = baseIndex + x; + const float y = in_[i + pcs.inOff]; + out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y))); + } +} diff --git a/kompute-shaders/op_getrows.comp b/kompute-shaders/op_getrows.comp new file mode 100644 index 000000000..1a5581b23 --- /dev/null +++ b/kompute-shaders/op_getrows.comp @@ -0,0 +1,17 @@ +void main() { + const uint i = gl_WorkGroupID.x; + const int r = inB[i + pcs.inBOff]; + + int z = 0; + for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) { + const uint inIndex = (r * pcs.nb01 + 
pcs.inAOff) + ind/NL * SIZE_OF_BLOCK; + const mat4 result = dequantize_block(inIndex, ind%NL); + for (uint j = 0; j < 4; ++j) { + for (uint k = 0; k < 4; ++k) { + const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z; + out_[outIndex] = result[j][k]; + ++z; + } + } + } +} diff --git a/kompute-shaders/op_getrows_f16.comp b/kompute-shaders/op_getrows_f16.comp new file mode 100644 index 000000000..34acbcd70 --- /dev/null +++ b/kompute-shaders/op_getrows_f16.comp @@ -0,0 +1,31 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { int inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int nb01; + int nb1; +} pcs; + +void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { + for (int j = 0; j < k; j++) { + out_[y + j] = inA[x + j]; + } +} + +void main() { + const uint i = gl_WorkGroupID.x; + const int r = inB[i + pcs.inBOff]; + + dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1 + pcs.outOff, pcs.ne00); +} diff --git a/kompute-shaders/op_getrows_q4_0.comp b/kompute-shaders/op_getrows_q4_0.comp new file mode 100644 index 000000000..32b2e891e --- /dev/null +++ b/kompute-shaders/op_getrows_q4_0.comp @@ -0,0 +1,38 @@ +#version 450 + +#include "common.comp" + +#define NL 2 +#define BYTES_FOR_TYPE 4 /*bytes for float*/ +#define SIZE_OF_BLOCK sizeof_block_q4_0 + +layout(local_size_x = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { int inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int nb01; + int nb1; +} pcs; + 
+block_q4_0 get_unaligned_block_q4_0(uint index) { + block_q4_0 fres; + fres.d = u8BufToFloat16(inA, index); + [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) { + fres.qs[it] = inA[index+2+it]; + } + return fres; +} + +mat4 dequantize_block(uint index, uint il) { + const block_q4_0 block = get_unaligned_block_q4_0(index); + return dequantize_q4_0(block, il); +} + +#include "op_getrows.comp" diff --git a/kompute-shaders/op_getrows_q4_1.comp b/kompute-shaders/op_getrows_q4_1.comp new file mode 100644 index 000000000..87f2fbe17 --- /dev/null +++ b/kompute-shaders/op_getrows_q4_1.comp @@ -0,0 +1,39 @@ +#version 450 + +#include "common.comp" + +#define NL 2 +#define BYTES_FOR_TYPE 4 /*bytes for float*/ +#define SIZE_OF_BLOCK sizeof_block_q4_1 + +layout(local_size_x = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { int inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int nb01; + int nb1; +} pcs; + +block_q4_1 get_unaligned_block_q4_1(uint index) { + block_q4_1 fres; + fres.d = u8BufToFloat16(inA, index); + fres.m = u8BufToFloat16(inA, index+2); + [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { + fres.qs[it] = inA[index+4+it]; + } + return fres; +} + +mat4 dequantize_block(uint index, uint il) { + const block_q4_1 block = get_unaligned_block_q4_1(index); + return dequantize_q4_1(block, il); +} + +#include "op_getrows.comp" diff --git a/kompute-shaders/op_getrows_q6_k.comp b/kompute-shaders/op_getrows_q6_k.comp new file mode 100644 index 000000000..9ce3545d1 --- /dev/null +++ b/kompute-shaders/op_getrows_q6_k.comp @@ -0,0 +1,44 @@ +#version 450 + +#include "common.comp" + +#define NL 16 +#define BYTES_FOR_TYPE 4 /*bytes for float*/ +#define SIZE_OF_BLOCK sizeof_block_q6_k + +layout(local_size_x = 1) in; + +layout (binding = 0) readonly buffer tensorInA { 
uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { int inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int nb01; + int nb1; +} pcs; + +block_q6_k get_unaligned_block_q6_k(uint index) { + block_q6_k fres; + [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { + fres.ql[it] = inA[index + it]; + } + [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { + fres.qh[it] = inA[index + QK_K/2 + it]; + } + [[unroll]] for (uint it = 0; it != QK_K / 16; it++) { + fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); + } + fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); + return fres; +} + +mat4 dequantize_block(uint index, uint il) { + const block_q6_k block = get_unaligned_block_q6_k(index); + return dequantize_q6_k(block, il); +} + +#include "op_getrows.comp" diff --git a/kompute-shaders/op_mul.comp b/kompute-shaders/op_mul.comp new file mode 100644 index 000000000..d599460c3 --- /dev/null +++ b/kompute-shaders/op_mul.comp @@ -0,0 +1,24 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; +} pcs; + +void main() { + const uint baseIndex = gl_WorkGroupID.x * 4; + + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i) + pcs.inBOff]; + } +} \ No newline at end of file diff --git a/kompute-shaders/op_mul_mat_f16.comp b/kompute-shaders/op_mul_mat_f16.comp new file mode 100644 index 000000000..dd1e13979 --- /dev/null +++ b/kompute-shaders/op_mul_mat_f16.comp @@ -0,0 +1,48 @@ +#version 450 + +#include 
"common.comp" + +#extension GL_KHR_shader_subgroup_arithmetic : require + +layout(local_size_x_id = 0) in; + +layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + uint nb01; + uint nb02; + uint nb11; + uint nb12; + uint ne02; + uint ne12; + int ne0; + int ne1; +} pcs; + +void main() { + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint im = gl_WorkGroupID.z; + + uint bc_ab = pcs.ne12 > pcs.ne02 ? im / (pcs.ne12 / pcs.ne02) : im; + uint bc_ba = pcs.ne02 > pcs.ne12 ? im / (pcs.ne02 / pcs.ne12) : im; + + const uint x = (r0*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA + const uint y = (r1*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB + + float sumf = 0.0f; + for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { + sumf += float(inA[x+i]) * float(inB[y+i]); + } + + const float all_sum = subgroupAdd(sumf); + if (subgroupElect()) { + out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum; + } +} diff --git a/kompute-shaders/op_mul_mat_mat_f32.comp b/kompute-shaders/op_mul_mat_mat_f32.comp new file mode 100644 index 000000000..6cc5558b2 --- /dev/null +++ b/kompute-shaders/op_mul_mat_mat_f32.comp @@ -0,0 +1,51 @@ +#version 450 + +#include "common.comp" + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable + +// device subgroup size +layout (local_size_x_id = 0) in; + +layout(binding = 0) readonly buffer tensorInA { float inA[]; }; +layout(binding = 1) readonly buffer tensorInB { float inB[]; }; +layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout(push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne01; + int 
ne02; + int ne11; + int ne12; + uint nb01; + uint nb02; + uint nb11; + uint nb12; + uint nb1; + uint nb2; +} +pcs; + + +void main() { + uvec3 gid = gl_WorkGroupID; + + uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; + uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; + + const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA + const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB + float sum = 0.0f; + for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { + sum += float(inA[x+i]) * float(inB[y+i]); + } + + const float all_sum = subgroupAdd(sum); + if (subgroupElect()) { + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; + } +} \ No newline at end of file diff --git a/kompute-shaders/op_mul_mat_q4_0.comp b/kompute-shaders/op_mul_mat_q4_0.comp new file mode 100644 index 000000000..03788c920 --- /dev/null +++ b/kompute-shaders/op_mul_mat_q4_0.comp @@ -0,0 +1,51 @@ +#version 450 + +#include "common.comp" + +#define BLOCKS_IN_QUANT QK4_0 +#define SIZE_OF_BLOCK sizeof_block_q4_0 +#define N_ROWS 4 + +layout(local_size_x_id = 0) in; +layout(local_size_y = 1) in; +layout(local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; + int ne1; + int ne01; + int gqa; +} pcs; + +// The q4_0 version of this function +float block_q_n_dot_y(uint block_index, uint yb, uint il) { + vec2 acc = vec2(0.0, 0.0); + const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; + float d = float(u8BufToFloat16(inA, index)); + float sumy = 0.0f; + for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { + const uint16_t b = u8BufToU16(inA, index + 2 + il 
+ i); + + const float yl0 = inB[yb + i]; + const float yl1 = inB[yb + i + 1]; + const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; + const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; + + sumy += yl0 + yl1 + yl8 + yl9; + + acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); + acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); + } + return d * (sumy * -8.f + acc[0] + acc[1]); +} + +#include "op_mul_mv_q_n.comp" diff --git a/kompute-shaders/op_mul_mat_q4_1.comp b/kompute-shaders/op_mul_mat_q4_1.comp new file mode 100644 index 000000000..0ae8f8c7d --- /dev/null +++ b/kompute-shaders/op_mul_mat_q4_1.comp @@ -0,0 +1,53 @@ +#version 450 + +#include "common.comp" + +#define BLOCKS_IN_QUANT QK4_1 +#define SIZE_OF_BLOCK sizeof_block_q4_1 +#define N_ROWS 4 + +layout(local_size_x_id = 0) in; +layout(local_size_y = 1) in; +layout(local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; + int ne1; + int ne01; + int gqa; +} pcs; + +// The q4_1 version of this function +float block_q_n_dot_y(uint block_index, uint yb, uint il) { + vec2 acc = vec2(0.0, 0.0); + const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; + float d = float(u8BufToFloat16(inA, index)); + float m = float(u8BufToFloat16(inA, index+2)); + + float sumy = 0.0f; + for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { + const uint16_t b = u8BufToU16(inA, index + 4 + il + i); + + const float yl0 = inB[yb + i]; + const float yl1 = inB[yb + i + 1]; + const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; + const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; + + sumy += yl0 + yl1 + yl8 + yl9; + + acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); + acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 
4096.f * (b & 0xF000); + } + return d * (acc[0] + acc[1]) + sumy * m; +} + +#include "op_mul_mv_q_n.comp" diff --git a/kompute-shaders/op_mul_mat_q6_k.comp b/kompute-shaders/op_mul_mat_q6_k.comp new file mode 100644 index 000000000..c9baebdf4 --- /dev/null +++ b/kompute-shaders/op_mul_mat_q6_k.comp @@ -0,0 +1,94 @@ +#version 450 + +#include "common.comp" + +#define SIZE_OF_BLOCK sizeof_block_q6_k + +layout(local_size_x_id = 0) in; +layout(local_size_y_id = 1) in; +layout(local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; + int ne1; + int ne01; + int gqa; +} pcs; + +void main() { + const uint8_t kmask1 = uint8_t(0x03); + const uint8_t kmask2 = uint8_t(0x0C); + const uint8_t kmask3 = uint8_t(0x30); + const uint8_t kmask4 = uint8_t(0xC0); + + const uint nb = pcs.ne00/QK_K; + + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint r2 = gl_WorkGroupID.z; + + const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID); + const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0); + const uint x = row * nb + offset0; // Based from inA without base offset + const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB + + float sumf = 0; + + // bits of invocation ID for gl_SubgroupSize=32: + // x x x x x + // 4 3 2 1 0 + // ( tid ) ix + // ip ( il ) + + const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes + const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0 + const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1 + const uint ip = tid/8; // first or second half of block (0 or 1) + const uint il = tid%8; // each half has 8 parts, one 
per scale + const uint n = 4; // 4 scales at a time (and 4 sums) + const uint l0 = n*il; // offset into half-block, 0..28 + const uint is = 8*ip + l0/16; // 0, 1, 8, 9 + + const uint y_offset = 128*ip + l0; + const uint q_offset_l = 64*ip + l0; + const uint q_offset_h = 32*ip + l0; + + for (uint i = ix; i < nb; i += block_stride) { + + const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; + + const uint qlIndex = q_offset_l; + const uint q2Index = qlIndex + QK_K/8; + const uint qhIndex = q_offset_h; + const uint y = yy + i * QK_K + y_offset; + + float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (uint l = 0; l < n; ++l) { + const uint8_t currentQ1 = inA[baseIndex + qlIndex + l]; + const uint8_t currentQ2 = inA[baseIndex + q2Index + l]; + const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l]; + + sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32); + sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32); + sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32); + sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32); + } + + float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16); + sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is])); + } + + const float tot = subgroupAdd(sumf); + if (subgroupElect()) { + out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot; + } +} diff --git a/kompute-shaders/op_mul_mat_q8_0.comp b/kompute-shaders/op_mul_mat_q8_0.comp new file mode 100644 index 000000000..1c4ddbb08 --- /dev/null +++ b/kompute-shaders/op_mul_mat_q8_0.comp @@ -0,0 +1,56 @@ +#version 450 + +#include "common.comp" + +#define BLOCKS_IN_QUANT QK8_0 +#define SIZE_OF_BLOCK sizeof_block_q8_0 +#define 
N_ROWS 4 + +layout(local_size_x_id = 0) in; +layout(local_size_y = 1) in; +layout(local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; + int ne1; + int ne01; + int gqa; +} pcs; + +#define ELS_PER_BLOCK 32 +#define SIZE_OF_D 2 +#define BLOCK_SIZE (ELS_PER_BLOCK + SIZE_OF_D) + +void main() { + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint im = gl_WorkGroupID.z; + + const uint x = r0 * (pcs.ne00/ELS_PER_BLOCK) * BLOCK_SIZE + pcs.inAOff; // Based from inA + const uint y = r1 * pcs.ne10 + pcs.inBOff; // based from inB + + float sumf = 0.0f; + for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { + const uint block_number = i / ELS_PER_BLOCK; + const uint block_offset = block_number * BLOCK_SIZE; + const float d = u8BufToFloat16(inA, x + block_offset); + const uint position_in_block = i % ELS_PER_BLOCK; + const int q = int8_t(inA[x+block_offset+SIZE_OF_D+position_in_block]); + const float dq = d * q; + sumf += dq * float(inB[y+i]); + } + + const float all_sum = subgroupAdd(sumf); + if (subgroupElect()) { + out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum; + } +} diff --git a/kompute-shaders/op_mul_mv_q_n.comp b/kompute-shaders/op_mul_mv_q_n.comp new file mode 100644 index 000000000..8b6e6a2e2 --- /dev/null +++ b/kompute-shaders/op_mul_mv_q_n.comp @@ -0,0 +1,41 @@ +void main() { + if (gl_SubgroupInvocationID > 31) + return; + + const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT); + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint im = gl_WorkGroupID.z; + const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS; + const uint offset0 = first_row * nb + 
im/pcs.gqa*(nb*pcs.ne0); + + const uint x = offset0; // Based from inA without base offset + const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB + + float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f}; + + const uint ix = gl_SubgroupInvocationID/2; + const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2); + + uint yb = y + ix * BLOCKS_IN_QUANT + il; + + //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n", + // gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize, + // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z); + + for (uint ib = ix; ib < nb; ib += 16) { + for (int row = 0; row < N_ROWS; row++) { + const uint block_index = x + ib + row * nb; + sumf[row] += block_q_n_dot_y(block_index, yb, il); + } + + yb += BLOCKS_IN_QUANT * 16; + } + + for (int row = 0; row < N_ROWS; ++row) { + const float tot = subgroupAdd(sumf[row]); + if (first_row + row < pcs.ne01 && subgroupElect()) { + out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot; + } + } +} diff --git a/kompute-shaders/op_mulrow.comp b/kompute-shaders/op_mulrow.comp new file mode 100644 index 000000000..ae7106320 --- /dev/null +++ b/kompute-shaders/op_mulrow.comp @@ -0,0 +1,25 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; + uint row; +} pcs; + +void main() { + const uint baseIndex = gl_WorkGroupID.x * 4; + + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i % pcs.row) + pcs.inBOff]; + } +} \ No 
newline at end of file
diff --git a/kompute-shaders/op_norm.comp b/kompute-shaders/op_norm.comp
new file mode 100644
index 000000000..ad0c3c01b
--- /dev/null
+++ b/kompute-shaders/op_norm.comp
@@ -0,0 +1,90 @@
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 256) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    uint ne00;
+    uint nb01;
+    float eps;
+} pcs;
+
+// scratch for the workgroup-wide tree reduction; used first for the mean, then for the variance
+shared float sum[gl_WorkGroupSize.x];
+
+// Each workgroup normalizes one run of ne00 floats: out = (in - mean) / sqrt(variance + eps).
+void main() {
+    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
+    // MEAN
+    // parallel sum
+    sum[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
+        sum[gl_LocalInvocationID.x] += in_[x+i00];
+    }
+
+    // reduce
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast
+    if (gl_LocalInvocationID.x == 0) {
+        sum[0] /= float(pcs.ne00);
+    }
+    barrier();
+    memoryBarrierShared();
+    const float mean = sum[0];
+
+    // recenter
+    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
+        out_[y+i00] = in_[x+i00] - mean;
+    }
+
+    // VARIANCE
+    // sum[0] still holds the mean: wait until every invocation has read it
+    // before any invocation overwrites sum[] with its variance partial
+    barrier();
+    memoryBarrierShared();
+    // parallel sum
+    sum[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
+        sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
+    }
+
+    // reduce
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
+        if (gl_LocalInvocationID.x < i) {
+            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
+        }
+        barrier();
+        memoryBarrierShared();
+    }
+
+    // broadcast
+    if (gl_LocalInvocationID.x == 0) {
+        sum[0] /= float(pcs.ne00);
+    }
+    barrier();
+    memoryBarrierShared();
+    const float variance = sum[0];
+
+    const float scale = 1.0f/sqrt(variance + pcs.eps);
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
+        out_[y+i00] *= scale;
+    }
+}
diff --git a/kompute-shaders/op_relu.comp b/kompute-shaders/op_relu.comp
new file mode 100644
index 000000000..52a601fe6
--- /dev/null
+++ b/kompute-shaders/op_relu.comp
@@ -0,0 +1,21 @@
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+} pcs;
+
+void main() {
+    const uint baseIndex = gl_WorkGroupID.x * 4;
+
+    for (uint x = 0; x < 4; x++) {
+        const uint i = baseIndex + x;
+        out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
+    }
+}
diff --git a/kompute-shaders/op_rmsnorm.comp b/kompute-shaders/op_rmsnorm.comp
new file mode 100644
index 000000000..da658c160
--- /dev/null
+++ b/kompute-shaders/op_rmsnorm.comp
@@ -0,0 +1,53 @@
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 512) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    uint ne00;
+    uint nb01;
+    float eps;
+} pcs;
+
+shared float sum[gl_WorkGroupSize.x];
+
+void main() {
+    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
+
+    // parallel sum
+    sum[gl_LocalInvocationID.x] = 0.0;
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
+        sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
+    }
+
+    // reduce
+    barrier();
+    memoryBarrierShared();
+    [[unroll]] for (uint i
= gl_WorkGroupSize.x/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; + } + barrier(); + memoryBarrierShared(); + } + + // broadcast + if (gl_LocalInvocationID.x == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + + const float scale = 1.0f/sqrt(sum[0] + pcs.eps); + + const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_ + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { + out_[y+i00] = in_[x+i00] * scale; + } +} diff --git a/kompute-shaders/op_rope_f16.comp b/kompute-shaders/op_rope_f16.comp new file mode 100644 index 000000000..b44622584 --- /dev/null +++ b/kompute-shaders/op_rope_f16.comp @@ -0,0 +1,73 @@ +#version 450 + +#include "rope_common.comp" + +layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; }; + +void main() { + const uint i3 = gl_WorkGroupID.z; + const uint i2 = gl_WorkGroupID.y; + const uint i1 = gl_WorkGroupID.x; + + const bool is_neox = (pcs.mode & 2) != 0; + + float corr_dims[2]; + rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); + + const int p = inB[pcs.inBOff + i2]; + + float theta = float(p); + + if (!is_neox) { + for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) { + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + + theta *= theta_scale; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + const float x0 = float(inA[src]); + const 
float x1 = float(inA[src+1]); + + out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); + out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta); + } + } else { + const float inv_ndims = -1.f/pcs.n_dims; + for (uint ic = 0; ic < pcs.n_dims; ic += 2) { + const uint cur_rot = ic; + + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + + theta *= theta_scale; + + const uint i0 = ic/2; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + const float x0 = float(inA[src]); + const float x1 = float(inA[src+pcs.n_dims/2]); + + out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); + out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta); + } + + for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) { + const uint i0 = ic; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + out_[dst_data + 0] = inA[src + 0]; + out_[dst_data + 1] = inA[src + 1]; + } + } +} diff --git a/kompute-shaders/op_rope_f32.comp b/kompute-shaders/op_rope_f32.comp new file mode 100644 index 000000000..2c0235d75 --- /dev/null +++ b/kompute-shaders/op_rope_f32.comp @@ -0,0 +1,73 @@ +#version 450 + +#include "rope_common.comp" + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +void main() { + const uint i3 = gl_WorkGroupID.z; + const uint i2 = gl_WorkGroupID.y; + const uint i1 = gl_WorkGroupID.x; + + const bool is_neox = (pcs.mode & 2) != 0; + + float 
corr_dims[2]; + rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); + + const int p = inB[pcs.inBOff + i2]; + + float theta = float(p); + + if (!is_neox) { + for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) { + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + + theta *= theta_scale; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + const float x0 = inA[src]; + const float x1 = inA[src+1]; + + out_[dst_data] = x0*cos_theta - x1*sin_theta; + out_[dst_data+1] = x0*sin_theta + x1*cos_theta; + } + } else { + const float inv_ndims = -1.f/pcs.n_dims; + for (uint ic = 0; ic < pcs.n_dims; ic += 2) { + const uint cur_rot = ic; + + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + + theta *= theta_scale; + + const uint i0 = ic/2; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + const float x0 = inA[src]; + const float x1 = inA[src+pcs.n_dims/2]; + + out_[dst_data] = x0*cos_theta - x1*sin_theta; + out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta; + } + + for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) { + const uint i0 = ic; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + out_[dst_data + 0] = inA[src + 0]; + 
out_[dst_data + 1] = inA[src + 1]; + } + } +} diff --git a/kompute-shaders/op_scale.comp b/kompute-shaders/op_scale.comp new file mode 100644 index 000000000..bdae26738 --- /dev/null +++ b/kompute-shaders/op_scale.comp @@ -0,0 +1,19 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + float scale; +} pcs; + +void main() { + const uint i = gl_WorkGroupID.x; + out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; +} diff --git a/kompute-shaders/op_scale_8.comp b/kompute-shaders/op_scale_8.comp new file mode 100644 index 000000000..ada69754b --- /dev/null +++ b/kompute-shaders/op_scale_8.comp @@ -0,0 +1,23 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + float scale; +} pcs; + +void main() { + const uint baseIndex = gl_WorkGroupID.x * 8; + + for (uint x = 0; x < 8; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; + } +} diff --git a/kompute-shaders/op_silu.comp b/kompute-shaders/op_silu.comp new file mode 100644 index 000000000..0fb8e4b74 --- /dev/null +++ b/kompute-shaders/op_silu.comp @@ -0,0 +1,22 @@ +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; +} pcs; + +void main() { + const uint baseIndex = gl_WorkGroupID.x * 4; + + for (uint x = 0; x < 4; x++) { + const uint i = 
baseIndex + x; + const float y = in_[i + pcs.inOff]; + out_[i + pcs.outOff] = y / (1.0 + exp(-y)); + } +} diff --git a/kompute-shaders/op_softmax.comp b/kompute-shaders/op_softmax.comp new file mode 100644 index 000000000..89de1b701 --- /dev/null +++ b/kompute-shaders/op_softmax.comp @@ -0,0 +1,51 @@ +// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4) + +#version 450 + +#include "common.comp" + +layout(local_size_x_id = 0) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; +} pcs; + +void main() { + if (gl_SubgroupInvocationID > 31) + return; + + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00; + const uint psrc0 = extra_off + pcs.inOff; // Based from in_ + const uint pdst = extra_off + pcs.outOff; // Based from out_ + + // parallel max + float localMax = uintBitsToFloat(0xFF800000); + for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { + localMax = max(localMax, in_[psrc0 + i00]); + } + float max_ = subgroupMax(localMax); + + // parallel sum + float localSum = 0.0f; + for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { + const float exp_psrc0 = exp(in_[psrc0 + i00] - max_); + localSum += exp_psrc0; + out_[pdst + i00] = exp_psrc0; + } + + const float sum = subgroupAdd(localSum); + for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { + out_[pdst + i00] /= sum; + } +} diff --git a/kompute-shaders/rope_common.comp b/kompute-shaders/rope_common.comp new file mode 100644 index 000000000..57ba6597a --- /dev/null +++ b/kompute-shaders/rope_common.comp @@ -0,0 +1,67 @@ +#include "common.comp" + +// TODO: use a local size of 32 or 
more (Metal uses 1024) +layout(local_size_x = 1) in; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int n_dims; + int mode; + int n_orig_ctx; + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +float rope_yarn_ramp(const float low, const float high, const float i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale, + out float cos_theta, out float sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); + } + cos_theta = cos(theta) * mscale; + sin_theta = sin(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * log(n_orig_ctx / (n_rot * TWOPI_F)) / (2 * log(base)); +} + +void rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2] +) { + // start and end correction dims + dims[0] = max(0.0f, 
floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); +} diff --git a/llama.cpp b/llama.cpp index e1f1932ba..0588250f2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,5 +1,4 @@ #define LLAMA_API_INTERNAL -//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading #include "llama.h" #include "unicode.h" @@ -12,6 +11,8 @@ # include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) # include "ggml-opencl.h" +#elif defined(GGML_USE_KOMPUTE) +# include "ggml-kompute.h" #endif #ifdef GGML_USE_METAL @@ -152,10 +153,6 @@ static bool is_float_close(float a, float b, float abs_tol) { return std::fabs(b - a) <= abs_tol; } -#ifdef GGML_USE_CPU_HBM -#include -#endif - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -1190,12 +1187,6 @@ struct llama_mlock { #endif }; -typedef void (*offload_func_t)(struct ggml_tensor * tensor); - -static void ggml_offload_nop(struct ggml_tensor * tensor) { - (void) tensor; -} - static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); @@ -1211,19 +1202,14 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ return std::string(result.data(), result.size()); } -static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { +static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - buft = ggml_backend_metal_buffer_type(); +#if defined(GGML_USE_CUBLAS) + // host buffers should only be used when data is expected to be copied to/from the GPU + if (host_buffer) { + buft = 
ggml_backend_cuda_host_buffer_type(); } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (n_gpu_layers > 0) { - buft = ggml_backend_cuda_buffer_type(0); - } -#elif defined(GGML_USE_CUBLAS) - buft = ggml_backend_cuda_host_buffer_type(); #elif defined(GGML_USE_CPU_HBM) buft = ggml_backend_cpu_hbm_buffer_type(); #endif @@ -1231,10 +1217,47 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { if (buft == nullptr) { buft = ggml_backend_cpu_buffer_type(); } - return buft; - GGML_UNUSED(n_gpu_layers); + GGML_UNUSED(host_buffer); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_METAL + buft = ggml_backend_metal_buffer_type(); +#elif defined(GGML_USE_CUBLAS) + buft = ggml_backend_cuda_buffer_type(gpu); +#elif defined(GGML_USE_CLBLAST) + buft = ggml_backend_opencl_buffer_type(); +#elif defined(GGML_USE_KOMPUTE) + buft = ggml_backend_kompute_buffer_type(); +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_cpu(true); + } + return buft; + + GGML_UNUSED(gpu); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_CUBLAS + if (ggml_backend_cuda_get_device_count() > 1) { + buft = ggml_backend_cuda_split_buffer_type(tensor_split); + } +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_offload(fallback_gpu); + } + return buft; + + GGML_UNUSED(tensor_split); } // @@ -1445,24 +1468,24 @@ struct llama_kv_cache { std::vector k_l; // per layer std::vector v_l; - struct ggml_context * ctx = NULL; + std::vector ctxs; + std::vector bufs; - ggml_backend_buffer_t buf = NULL; + size_t total_size() const { + size_t size = 0; + for (ggml_backend_buffer_t buf : bufs) { + size += ggml_backend_buffer_get_size(buf); + } + return size; + } ~llama_kv_cache() { -#if 
defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < k_l.size(); ++i) { - ggml_cuda_free_data(k_l[i]); - ggml_cuda_free_data(v_l[i]); - } - } -#endif - if (ctx) { + for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); } - - ggml_backend_buffer_free(buf); + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } } }; @@ -1539,16 +1562,32 @@ struct llama_model { std::vector layers; + llama_split_mode split_mode; + int main_gpu; int n_gpu_layers; // gguf metadata std::unordered_map gguf_kv; - // context - struct ggml_context * ctx = NULL; + // layer -> buffer type mapping + struct layer_buft { + layer_buft() : buft_matrix(nullptr), buft(nullptr) {} + layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {} + layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {} - // the model memory buffer - ggml_backend_buffer_t buf = NULL; + ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication + ggml_backend_buffer_type_t buft; // everything else + }; + + layer_buft buft_input; + layer_buft buft_output; + std::vector buft_layer; + + // contexts where the model tensors metadata is stored + std::vector ctxs; + + // the model memory buffers for the tensor data + std::vector bufs; // model memory mapped file std::unique_ptr mapping; @@ -1564,39 +1603,32 @@ struct llama_model { int64_t t_start_us = 0; ~llama_model() { -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cuda_free_data(tensors_by_name[i].second); - } - ggml_cuda_free_scratch(); - } -#endif - -#if defined(GGML_USE_CLBLAST) - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cl_free_data(tensors_by_name[i].second); - } -#endif - if (ctx) { 
+ for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); } - - ggml_backend_buffer_free(buf); + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } } }; struct llama_context { llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} ~llama_context() { - ggml_allocr_free(alloc); - ggml_backend_buffer_free(buf_alloc); - ggml_backend_free(backend); + ggml_backend_sched_free(sched); + + for (ggml_backend_t backend : backends) { + ggml_backend_free(backend); + } } llama_cparams cparams; - ggml_backend_t backend = nullptr; + std::vector backends; +#ifdef GGML_USE_METAL + ggml_backend_t backend_metal = nullptr; +#endif + ggml_backend_t backend_cpu = nullptr; const llama_model & model; @@ -1630,8 +1662,9 @@ struct llama_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; - ggml_backend_buffer_t buf_alloc = NULL; - ggml_allocr * alloc = NULL; + ggml_backend_sched_t sched = nullptr; + // allocator for the input tensors + ggml_tallocr * alloc = nullptr; // temporary buffer for copying data to/from the backend std::vector> buf_copy; @@ -1646,16 +1679,17 @@ struct llama_context { // static bool llama_kv_cache_init( - const struct llama_hparams & hparams, struct llama_kv_cache & cache, + const llama_model & model, ggml_type ktype, ggml_type vtype, uint32_t n_ctx, - int n_gpu_layers, bool offload) { + const struct llama_hparams & hparams = model.hparams; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_layer = hparams.n_layer; + const int64_t n_layer = hparams.n_layer; cache.has_shift = false; @@ -1666,62 +1700,65 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - struct ggml_init_params params; - params.mem_size = 2u*n_layer*ggml_tensor_overhead(); - params.mem_buffer = NULL; - params.no_alloc = true; +#ifdef GGML_USE_CLBLAST + offload = false; 
+#endif - cache.ctx = ggml_init(params); + // count used buffer types + std::map buft_layer_count; + if (offload) { + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + } + } else { + buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; + } - size_t vram_kv_cache = 0; - - if (!cache.ctx) { - LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); - return false; + // create a context for each buffer type + std::map ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct ggml_init_params params = { + /*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__); + return false; + } + ctx_map[it.first] = ctx; + cache.ctxs.push_back(ctx); } cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - const int i_gpu_start = (int) n_layer - n_gpu_layers; - for (int i = 0; i < (int) n_layer; i++) { - ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx); + struct ggml_context * ctx = offload ? 
ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); + ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (i >= i_gpu_start) { - if (offload) { - ggml_cuda_assign_buffers_no_scratch(k); - ggml_cuda_assign_buffers_no_scratch(v); - vram_kv_cache += ggml_nbytes(k); - vram_kv_cache += ggml_nbytes(v); - // HACK: mark tensor as allocated - k->data = v->data = (void *)(uintptr_t)1; - } + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); + return false; } -#endif // GGML_USE_CUBLAS + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + cache.bufs.push_back(buf); } - // allocate tensors - cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers)); - - // buf may be NULL with full offload - if (cache.buf) { - // initialize the buffer to avoid NaNs in the padding - ggml_backend_buffer_clear(cache.buf, 0); - } - - if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); - } - - GGML_UNUSED(i_gpu_start); - GGML_UNUSED(offload); - return true; } @@ -2274,6 +2311,9 @@ struct llama_model_loader { use_mmap = false; } +#ifdef GGML_USE_KOMPUTE + use_mmap = false; +#endif this->use_mmap = use_mmap; } @@ -2353,9 +2393,8 @@ struct llama_model_loader 
{ return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - tensor->backend = backend; // TODO: ggml_set_backend ggml_set_name(tensor, ggml_get_name(meta)); n_created++; @@ -2363,7 +2402,7 @@ struct llama_model_loader { return tensor; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend, bool required = true) { + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { @@ -2373,12 +2412,6 @@ struct llama_model_loader { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } - if (backend == GGML_BACKEND_GPU_SPLIT) { - if (ne.size() == 1) { - throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); - } - } - { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -2396,7 +2429,7 @@ struct llama_model_loader { } } - return create_tensor_for(ctx, cur, backend); + return create_tensor_for(ctx, cur); } void done_getting_tensors() const { @@ -2415,26 +2448,36 @@ struct llama_model_loader { return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); } - void init_mapping(bool prefetch = true) { - /* - // prefetch only CPU tensors - if (use_mmap) { - size_t size_pref = 0; // prefetch - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - if (cur->backend == GGML_BACKEND_CPU) { - size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + 
ggml_nbytes(cur); - size_pref = std::max(size_pref, tensor_end); - } - } - mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa())); - } - */ + void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { // prefetch the whole file - all the data is needed anyway if (use_mmap) { mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa())); } + + // compute the total size of all tensors for progress reporting + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + size_data += ggml_nbytes(cur); + } + + if (use_mmap && mapping) { + if (lmlock) { + lmlock->init(mapping->addr); + } + mmap_used_first = mapping->size; + } + } + + void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const { + GGML_ASSERT(mapping); + + *first = mapping->size; + *last = 0; + for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { + const size_t offs = file_offset(ggml_get_name(tensor)); + *first = std::min(*first, offs); + *last = std::max(*last, offs + ggml_nbytes(tensor)); + } } // for backwards compatibility, does not support ggml-backend @@ -2442,8 +2485,11 @@ struct llama_model_loader { const size_t offs = file_offset(ggml_get_name(cur)); if (use_mmap && mapping) { - GGML_ASSERT(cur->data == nullptr); - cur->data = (uint8_t *)mapping->addr + offs; + if (cur->data == nullptr) { + cur->data = (uint8_t *)mapping->addr + offs; + } else { + memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur)); + } } else { GGML_ASSERT(cur->data != nullptr); file.seek(offs, SEEK_SET); @@ -2451,37 +2497,23 @@ struct llama_model_loader { } } + size_t size_done = 0; + size_t size_data = 0; + size_t mmap_used_first = -1; + size_t mmap_used_last = 0; + // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, 
llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { - size_t size_data = 0; - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); - } - - if (use_mmap && buf_mmap) { - if (lmlock) { - lmlock->init(mapping->addr); - } - } - -#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) - const bool legacy_offload = true; -#else - const bool legacy_offload = false; -#endif + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { + GGML_ASSERT(size_data != 0 && "call init_mapping() first"); std::vector> read_buf; - size_t size_done = 0; - - size_t mmap_first = -1; - size_t mmap_last = 0; - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - GGML_ASSERT(cur); // unused tensors should have been caught by load_data already + if (!cur) { + // some tensors may be allocated in a different context + continue; + } if (progress_callback) { if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { @@ -2491,67 +2523,48 @@ struct llama_model_loader { const size_t offs = file_offset(ggml_get_name(cur)); - if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) { - if (use_mmap && mapping) { - if (buf_mmap) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); - if (lmlock) { - lmlock->grow_to(offs + ggml_nbytes(cur)); - } - mmap_first = std::min(mmap_first, offs); - mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur)); - } else { - ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); + if (use_mmap && mapping) { + if (buf_mmap 
&& cur->data == nullptr) { + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); + if (lmlock) { + lmlock->grow_to(offs + ggml_nbytes(cur)); } + mmap_used_first = std::min(mmap_used_first, offs); + mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur)); } else { - if (ggml_backend_buffer_is_host(cur->buffer)) { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); - } else { - read_buf.resize(ggml_nbytes(cur)); - file.seek(offs, SEEK_SET); - file.read_raw(read_buf.data(), ggml_nbytes(cur)); - ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); - } + ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); } } else { - // HACK: mark tensor as allocated - cur->data = (void *)(uintptr_t)1; - void * data; - if (use_mmap && mapping) { - data = (uint8_t *) mapping->addr + offs; + if (ggml_backend_buffer_is_host(cur->buffer)) { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); file.seek(offs, SEEK_SET); file.read_raw(read_buf.data(), ggml_nbytes(cur)); - data = read_buf.data(); + ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); } - -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - ggml_cuda_transform_tensor(data, cur); -#elif defined(GGML_USE_CLBLAST) - GGML_ASSERT(cur->backend == GGML_BACKEND_GPU); - ggml_cl_transform_tensor(data, cur); -#else - GGML_ASSERT(!"GPU tensor without a GPU backend"); - GGML_UNUSED(data); -#endif } size_done += ggml_nbytes(cur); } - // unmap offloaded tensors and metadata - if (use_mmap && mapping) { - mapping->unmap_fragment(0, mmap_first); - mapping->unmap_fragment(mmap_last, mapping->size); + // check if this is the last call and do final cleanup + if (size_done >= size_data) { + // unmap offloaded tensors and metadata + if (use_mmap && mapping) { + mapping->unmap_fragment(0, mmap_used_first); + if (mmap_used_last != 0) { + 
mapping->unmap_fragment(mmap_used_last, mapping->size); + } + } + if (progress_callback) { + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); + } } - if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. - return progress_callback(1.0f, progress_callback_user_data); - } return true; } }; @@ -3178,6 +3191,7 @@ static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, + enum llama_split_mode split_mode, int main_gpu, const float * tensor_split, bool use_mlock, @@ -3185,702 +3199,563 @@ static bool llm_load_tensors( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); - auto & ctx = model.ctx; auto & hparams = model.hparams; + model.split_mode = split_mode; + model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; - size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors; + const int64_t n_layer = hparams.n_layer; + const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0); - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); + // there is very little benefit to offloading the input layer, so always keep it on the CPU + model.buft_input = llama_default_buffer_type_cpu(true); - // create the ggml context + model.buft_layer.resize(n_layer); + + // assign cpu layers + for (int64_t i = 0; i < i_gpu_start; ++i) { + model.buft_layer[i] = llama_default_buffer_type_cpu(true); + } + +#ifdef GGML_USE_CUBLAS + if (split_mode == LLAMA_SPLIT_LAYER) { + // calculate the split points + int device_count = ggml_backend_cuda_get_device_count(); + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); + float splits[GGML_CUDA_MAX_DEVICES]; + if (all_zero) { + // default split, 
by free memory + for (int i = 0; i < device_count; ++i) { + size_t total; + size_t free; + ggml_backend_cuda_get_device_memory(i, &total, &free); + splits[i] = free; + } + } else { + std::copy(tensor_split, tensor_split + device_count, splits); + } + + // sum and normalize the splits to get the split points + float split_sum = 0.0f; + for (int i = 0; i < device_count; ++i) { + split_sum += splits[i]; + splits[i] = split_sum; + } + for (int i = 0; i < device_count; ++i) { + splits[i] /= split_sum; + } + + // assign the repeating layers to the devices according to the splits + int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits; + model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); + } + // assign the output layer + if (n_gpu_layers > n_layer) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits; + model.buft_output = llama_default_buffer_type_offload(layer_gpu); + } else { + model.buft_output = llama_default_buffer_type_cpu(true); + } + } else +#endif { + ggml_backend_buffer_type_t split_buft; + if (split_mode == LLAMA_SPLIT_ROW) { + split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); + } else { + // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + split_buft = llama_default_buffer_type_offload(main_gpu); + } + // assign the repeating layers + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + model.buft_layer[i] = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } + // assign the output layer + if (n_gpu_layers > n_layer) { + model.buft_output = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } else { + model.buft_output = llama_default_buffer_type_cpu(true); + } + } + + // count used buffer types + std::map 
buft_layer_count; + buft_layer_count[model.buft_input.buft]++; + buft_layer_count[model.buft_input.buft_matrix]++; + buft_layer_count[model.buft_output.buft]++; + buft_layer_count[model.buft_output.buft_matrix]++; + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + buft_layer_count[model.buft_layer[i].buft_matrix]++; + } + + // create one context per buffer type + size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors; + std::map ctx_map; + for (auto & it : buft_layer_count) { struct ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - throw std::runtime_error(format("ggml_init() failed")); + ggml_context * ctx = ggml_init(params); + if (!ctx) { + throw std::runtime_error(format("failed to create context")); } + ctx_map[it.first] = ctx; + model.ctxs.push_back(ctx); } - (void) main_gpu; - - enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; - enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; - -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); - - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT; - } -#elif defined(GGML_USE_CLBLAST) - LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU; -#endif + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0); // create tensors for the weights { const int64_t n_embd = hparams.n_embd; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int64_t n_layer = hparams.n_layer; + const int64_t n_embd_gqa = n_embd_v_gqa; 
const int64_t n_vocab = hparams.n_vocab; + const int64_t n_ff = hparams.n_ff; + + GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); + + ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); + ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); + ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix); + auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); }; + auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); }; + + model.layers.resize(n_layer); const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // optional bias tensors - layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false); - layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false); - layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); + layer.bv = 
ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false); + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false); if (layer.ffn_gate_inp == nullptr) { GGML_ASSERT(hparams.n_expert == 0); GGML_ASSERT(hparams.n_expert_used == 0); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } else { GGML_ASSERT(hparams.n_expert > 0); GGML_ASSERT(hparams.n_expert_used > 0); // MoE branch for (uint32_t x = 0; x < hparams.n_expert; ++x) { - layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); - layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split); - layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); + layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}); + 
layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}); + layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}); } } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = 
ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_FALCON: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { - layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); - layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); } - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } 
} break; case LLM_ARCH_STARCODER: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = 
ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_PERSIMMON: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa 
== n_embd_k_gqa); + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); - const int i_gpu_start = n_layer - n_gpu_layers; - model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); - layer.attn_k_norm = 
ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}); + + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}); } } break; case LLM_ARCH_BLOOM: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, 
"weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = 
ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_MPT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < 
i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); // AWQ ScaleActivation layer - layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false); + layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); } } break; case LLM_ARCH_STABLELM: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); 
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - /* - llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ] - */ - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, 
tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_QWEN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - } - - const uint32_t n_ff = hparams.n_ff / 2; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}); } } break; case LLM_ARCH_PHI2: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 
GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = 
ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_PLAMO: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_GPT2: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), 
{n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = 
ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; default: @@ -3890,78 +3765,51 @@ static bool llm_load_tensors( ml.done_getting_tensors(); - ml.init_mapping(); + ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr); - // allocate tensors - size_t vram_weights = 0; - size_t buf_size = 0; + // create the backend buffers + std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs; - ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers); + for (auto & it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = nullptr; - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend - if (t->backend == GGML_BACKEND_CPU) { - buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft)); - } else { - vram_weights += ggml_nbytes(t); + // only the mmap region containing the tensors in the model is mapped to the backend buffer + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size + if (ml.use_mmap && 
buft == llama_default_buffer_type_cpu(true)) { + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); } - } - - // create backend buffer - ggml_backend_buffer_t buf_mmap = nullptr; - #ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - if (ml.use_mmap) { + else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { const size_t max_size = ggml_get_max_tensor_size(ctx); - model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); - buf_mmap = model.buf; - } else { - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size); } - } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - // for testing only - if (n_gpu_layers > 0) { - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0)); - } #endif - - if (model.buf == nullptr) { - // CPU backend, and indirectly CUDA and OpenCL - if (ml.use_mmap) { - model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); - buf_mmap = model.buf; - } else { - // allocate only CPU tensors - model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size); - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf); - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - if (t->backend == GGML_BACKEND_CPU) { - ggml_tallocr_alloc(alloc, t); - } + else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) { + model.mlock_buf.init (ggml_backend_buffer_get_base(buf)); + model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf)); } - ggml_tallocr_free(alloc); } - } - - if (use_mlock && 
ggml_backend_buffer_is_host(model.buf)) { - model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf)); - model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf)); + if (buf == nullptr) { + throw std::runtime_error("failed to allocate buffer"); + } + // indicate that this buffer contains weights + // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.bufs.push_back(buf); + ctx_bufs.emplace_back(ctx, buf); } // print memory requirements { - size_t sys_mem_required = ctx_size + buf_size; - - if (sys_mem_required > 0) { - LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0); - } - if (vram_weights > 0) { - LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); - } - -#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -3973,23 +3821,26 @@ static bool llm_load_tensors( const int max_offloadable_layers = hparams.n_layer + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); -#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - } -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - ggml_cuda_set_tensor_split(tensor_split); -#else - GGML_UNUSED(tensor_split); -#endif // GGML_USE_CUBLAS + for (ggml_backend_buffer_t buf : model.bufs) { + LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + } + } // populate tensors_by_name - for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * cur 
= ggml_get_tensor(ctx, ml.get_tensor_name(i)); - model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); + for (ggml_context * ctx : model.ctxs) { + for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); + } } - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { - return false; + // load tensor data + for (auto & it : ctx_bufs) { + ggml_context * ctx = it.first; + ggml_backend_buffer_t buf = it.second; + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) { + return false; + } } model.mapping = std::move(ml.mapping); @@ -4001,7 +3852,7 @@ static bool llm_load_tensors( } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -4022,14 +3873,29 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons return 0; } +#ifdef GGML_USE_KOMPUTE + if (ggml_vk_has_device() && params.n_gpu_layers > 0 && ( + !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) + || !( + model.ftype == LLAMA_FTYPE_ALL_F32 || + model.ftype == LLAMA_FTYPE_MOSTLY_F16 || + model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || + model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 + ) + )) { + // disable Vulkan due to unsupported model architecture or quantization type + params.n_gpu_layers = 0; + } +#endif + if (!llm_load_tensors( - ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, 
params.use_mlock, params.progress_callback, params.progress_callback_user_data )) { return -2; } } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); return -1; } @@ -4474,9 +4340,8 @@ struct llm_build_context { n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), cb (cb), - buf_compute_meta (lctx.buf_compute_meta) { - GGML_ASSERT(!!kv_self.ctx); - + buf_compute_meta (lctx.buf_compute_meta) + { // all initializations should be done in init() } @@ -4556,6 +4421,12 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, Qcur); + ggml_build_forward_expand(gf, Kcur); + ggml_build_forward_expand(gf, Vcur); + Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, @@ -6074,199 +5945,13 @@ struct llm_build_context { } }; -// -// tensor offloading helpers -// -// TODO: will be removed with backend v2 - -enum llm_offload_func_e { - OFFLOAD_FUNC_NOP, - OFFLOAD_FUNC, - OFFLOAD_FUNC_FRC, // force offload - OFFLOAD_FUNC_KQV, - OFFLOAD_FUNC_NR, - OFFLOAD_FUNC_EMB, // embeddings - OFFLOAD_FUNC_OUT, -}; - -// TODO: will be removed with backend v2 -struct llm_offload_trie { - struct node { - ~node() { - for (int i = 0; i < 256; ++i) { - if (children[i]) { - delete children[i]; - } - } - } - - node * children[256] = { nullptr }; - llm_offload_func_e func = OFFLOAD_FUNC_NOP; - }; - - llm_offload_trie() { - root = new node; - } - - llm_offload_trie(const std::unordered_map & map) { - root = new node; - - for (const auto & kv : map) { - add(kv.first, kv.second); - } - } - - ~llm_offload_trie() { - delete root; - } - - void add(const char * name, llm_offload_func_e 
func) { - node * cur = root; - - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; - - if (!c) { - break; - } - - if (!cur->children[c]) { - cur->children[c] = new node; - } - - cur = cur->children[c]; - } - - cur->func = func; - } - - llm_offload_func_e find(const char * name) const { - const node * cur = root; - - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; - - if (!c) { - break; - } - - if (!cur->children[c]) { - return OFFLOAD_FUNC_NOP; - } - - cur = cur->children[c]; - } - - return cur->func; - } - - node * root = nullptr; -}; - -// TODO: will be removed with backend v2 -static const std::unordered_map k_offload_map = { - //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - { "pos_embd", OFFLOAD_FUNC_NR }, - - { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope) - { "KQ_mask", OFFLOAD_FUNC_FRC }, - { "K_shift", OFFLOAD_FUNC_FRC }, - - { "K_shifted", OFFLOAD_FUNC }, - - { "inp_norm", OFFLOAD_FUNC_NR }, - { "inp_norm_w", OFFLOAD_FUNC_NR }, - { "inp_norm_wb", OFFLOAD_FUNC_NR }, - - { "norm", OFFLOAD_FUNC }, - { "norm_w", OFFLOAD_FUNC }, - { "norm_wb", OFFLOAD_FUNC }, - - { "attn_norm", OFFLOAD_FUNC }, - { "attn_norm_2", OFFLOAD_FUNC }, - - { "wqkv", OFFLOAD_FUNC_KQV }, - { "bqkv", OFFLOAD_FUNC_KQV }, - { "wqkv_clamped", OFFLOAD_FUNC_KQV }, - - { "tmpk", OFFLOAD_FUNC_KQV }, - { "tmpq", OFFLOAD_FUNC_KQV }, - { "tmpv", OFFLOAD_FUNC_KQV }, - { "Kcur", OFFLOAD_FUNC_KQV }, - { "Qcur", OFFLOAD_FUNC_KQV }, - { "Vcur", OFFLOAD_FUNC_KQV }, - - { "krot", OFFLOAD_FUNC_KQV }, - { "qrot", OFFLOAD_FUNC_KQV }, - { "kpass", OFFLOAD_FUNC_KQV }, - { "qpass", OFFLOAD_FUNC_KQV }, - { "krotated", OFFLOAD_FUNC_KQV }, - { "qrotated", OFFLOAD_FUNC_KQV }, - - { "q", OFFLOAD_FUNC_KQV }, - { "k", OFFLOAD_FUNC_KQV }, - { "kq", OFFLOAD_FUNC_KQV }, - { "kq_scaled", OFFLOAD_FUNC_KQV }, - { "kq_scaled_alibi", OFFLOAD_FUNC_KQV }, - { "kq_masked", 
OFFLOAD_FUNC_KQV }, - { "kq_soft_max", OFFLOAD_FUNC_KQV }, - { "kq_soft_max_ext", OFFLOAD_FUNC_KQV }, - { "v", OFFLOAD_FUNC_KQV }, - { "kqv", OFFLOAD_FUNC_KQV }, - { "kqv_merged", OFFLOAD_FUNC_KQV }, - { "kqv_merged_cont", OFFLOAD_FUNC_KQV }, - { "kqv_wo", OFFLOAD_FUNC_KQV }, - { "kqv_out", OFFLOAD_FUNC_KQV }, - - { "ffn_inp", OFFLOAD_FUNC }, - { "ffn_norm", OFFLOAD_FUNC }, - - { "ffn_up", OFFLOAD_FUNC }, - { "ffn_up_b", OFFLOAD_FUNC }, - { "ffn_gate", OFFLOAD_FUNC }, - { "ffn_gate_b", OFFLOAD_FUNC }, - { "ffn_gate_par", OFFLOAD_FUNC }, - { "ffn_act", OFFLOAD_FUNC }, - { "ffn_down", OFFLOAD_FUNC }, - { "ffn_down_b", OFFLOAD_FUNC }, - { "ffn_out", OFFLOAD_FUNC }, - - { "ffn_silu", OFFLOAD_FUNC }, - { "ffn_gelu", OFFLOAD_FUNC }, - { "ffn_relu", OFFLOAD_FUNC }, - { "ffn_sqr(relu)", OFFLOAD_FUNC }, - - { "ffn_moe_logits", OFFLOAD_FUNC }, - { "ffn_moe_probs", OFFLOAD_FUNC }, - { "ffn_moe_argsort", OFFLOAD_FUNC }, - { "ffn_moe_weights", OFFLOAD_FUNC }, - { "ffn_moe_weights_sum", OFFLOAD_FUNC }, - { "ffn_moe_weights_norm", OFFLOAD_FUNC }, - { "ffn_moe_weighted", OFFLOAD_FUNC }, - { "ffn_moe_up", OFFLOAD_FUNC }, - { "ffn_moe_gate", OFFLOAD_FUNC }, - { "ffn_moe_silu", OFFLOAD_FUNC }, - { "ffn_moe_gate_par", OFFLOAD_FUNC }, - { "ffn_moe_down", OFFLOAD_FUNC }, - { "ffn_moe_out", OFFLOAD_FUNC }, - - { "l_out", OFFLOAD_FUNC }, - - { "result_norm", OFFLOAD_FUNC_EMB }, - { "result_output_no_bias", OFFLOAD_FUNC_EMB }, - { "result_output", OFFLOAD_FUNC_OUT }, -}; - -static llm_offload_trie k_offload_func_trie(k_offload_map); - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { const auto & model = lctx.model; // check if we should build the worst-case graph (for memory measurement) - const bool worst_case = ggml_allocr_is_measure(lctx.alloc); + const bool worst_case = ggml_tallocr_is_measure(lctx.alloc); // keep track of the input that has already been allocated bool alloc_inp_tokens = false; @@ -6275,16 +5960,8 @@ static struct 
ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - const bool do_offload = true; -#else - const bool do_offload = true; // TODO: set to false after finishing refactoring -#endif - - int n_non_view = 0; // number of non-view tensors that have been processed by the callback - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) - // TODO: will be removed with backend v2 + // TODO: improve handling of input and output tensors, then replace this with ggml_set_name llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -6295,12 +5972,11 @@ static struct ggml_cgraph * llama_build_graph( // // allocate input tensors and set input data // - // TODO: will be removed with backend v2 if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur)); @@ -6309,10 +5985,10 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_tokens = true; } - if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) { + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -6323,9 +5999,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, 
cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t"); @@ -6336,9 +6012,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_kv = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -6376,9 +6052,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_ctx = cur->ne[0]; int32_t * data; @@ -6400,136 +6076,6 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_K_shift = true; } - - // view tensors are not processed further - if (cur->view_src != nullptr) { - return; - } - - if (cur->op != GGML_OP_NONE) { - n_non_view++; - } - - // - // offload layers - // - // TODO: will be removed with backend v2 - -//#define LLAMA_OFFLOAD_DEBUG - - if (!do_offload) { - return; - } - - const int n_layer = model.hparams.n_layer; - - const int n_gpu_layers = model.n_gpu_layers; - const int i_gpu_start = n_layer - n_gpu_layers; - - // should we offload the final norm? 
yes if we are not computing embeddings - const bool offload_emb = lctx.embedding.empty(); - - static const std::unordered_map> k_offload_func_name = { - { OFFLOAD_FUNC_NOP, "CPU" }, - { OFFLOAD_FUNC_OUT, "CPU" }, -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - { OFFLOAD_FUNC, "GPU (CUDA)" }, - { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, - { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, - { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, - { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, -#else - { OFFLOAD_FUNC, "CPU" }, - { OFFLOAD_FUNC_FRC, "CPU" }, - { OFFLOAD_FUNC_KQV, "CPU" }, - { OFFLOAD_FUNC_NR, "CPU" }, - { OFFLOAD_FUNC_EMB, "CPU" }, -#endif // GGML_USE_CUBLAS - }; - - // check the global map for what offload function to use for this tensor - llm_offload_func_e func_e = k_offload_func_trie.find(name); - - if (func_e == OFFLOAD_FUNC_NOP) { -#ifdef LLAMA_OFFLOAD_DEBUG - // if a tensor hasn't been offloaded, we warn the user - if (worst_case) { - LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, - cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); - } -#endif - - return; - } - - // count the number of layers and respect the provided n_gpu_layers - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: - break; - case OFFLOAD_FUNC: - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - break; - case OFFLOAD_FUNC_FRC: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } break; - case OFFLOAD_FUNC_KQV: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } else { - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - } - break; - case OFFLOAD_FUNC_NR: - if (n_gpu_layers <= n_layer + 0) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_EMB: - if (!offload_emb || n_gpu_layers < n_layer) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - default: GGML_ASSERT(false); - } - - offload_func_t func = ggml_offload_nop; - 
- // this is needed for compatibility with Metal for example -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; -#else - static offload_func_t ggml_offload_gpu = ggml_offload_nop; -#endif - - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; - case OFFLOAD_FUNC: - case OFFLOAD_FUNC_KQV: - case OFFLOAD_FUNC_FRC: - case OFFLOAD_FUNC_NR: - case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; - default: GGML_ASSERT(false); - } - - // apply offload function to the tensor - func(cur); - -#ifdef LLAMA_OFFLOAD_DEBUG - if (worst_case) { - LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); - } -#endif }; struct ggml_cgraph * result = NULL; @@ -6597,27 +6143,6 @@ static struct ggml_cgraph * llama_build_graph( llm.free(); - if (worst_case) { - int n_non_view_total = 0; - - for (int i = 0; i < result->n_nodes; ++i) { - if (result->nodes[i]->view_src == nullptr) { - n_non_view_total++; - } - } - - LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); - - if (n_non_view != n_non_view_total) { - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); - LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); - LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); - LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - } - } - return result; } @@ -6663,8 +6188,6 @@ static int llama_decode_internal( auto & kv_self = lctx.kv_self; - GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; const 
int64_t n_vocab = hparams.n_vocab; @@ -6718,12 +6241,8 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_allocr_reset(lctx.alloc); - ggml_cgraph * gf = llama_build_graph(lctx, batch); - ggml_allocr_alloc_graph(lctx.alloc, gf); - // the output is always the last tensor in the graph struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; GGML_ASSERT(strcmp(res->name, "result_output") == 0); @@ -6735,30 +6254,6 @@ static int llama_decode_internal( GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); } -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc); - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); - ggml_cuda_copy_to_device(node); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); - } - } - - // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed - if (!lctx.embedding.empty()) { - embeddings->backend = GGML_BACKEND_CPU; - } - res->backend = GGML_BACKEND_CPU; -#endif - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread @@ -6781,15 +6276,17 @@ static int llama_decode_internal( #endif #ifdef GGML_USE_METAL - if (ggml_backend_is_metal(lctx.backend)) { - ggml_backend_metal_set_n_cb(lctx.backend, n_threads); + if (ggml_backend_is_metal(lctx.backend_metal)) { + 
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); } #endif - if (ggml_backend_is_cpu(lctx.backend)) { - ggml_backend_cpu_set_n_threads(lctx.backend, n_threads); + if (lctx.backend_cpu != nullptr) { + ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); } - ggml_backend_graph_compute(lctx.backend, gf); + ggml_backend_sched_graph_compute(lctx.sched, gf); + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); #ifdef GGML_USE_MPI ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); @@ -9338,48 +8835,23 @@ static int llama_apply_lora_from_file_internal( LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - // create a name -> tensor map of the model to accelerate lookups - // find the max tensor size to estimate the required temporary buffer size - size_t max_tensor_size = 0; - std::unordered_map model_tensors; - for (const auto & kv : model.tensors_by_name) { - model_tensors.insert(kv); - size_t f32_size = ggml_nelements(kv.second) * sizeof(float); - max_tensor_size = std::max(max_tensor_size, f32_size); - } - - // create a temporary ggml context to store the lora tensors - // TODO: use ggml-alloc - size_t lora_ctx_size = max_tensor_size * 3; - LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0); - std::vector lora_buf(lora_ctx_size); - - struct ggml_init_params params; - params.mem_size = lora_buf.size(); - params.mem_buffer = lora_buf.data(); - params.no_alloc = false; - - using unique_context = std::unique_ptr; - - unique_context lora_ctx(nullptr, ggml_free); - lora_ctx.reset(ggml_init(params)); - std::unordered_map lora_tensors; - // load base model std::unique_ptr ml; - - if (path_base_model) { + if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); - 
ml->init_mapping(false); // no prefetching + ml->init_mapping(/*prefetch*/ false); // no prefetching } - // read tensors and apply - bool warned = false; - int n_tensors = 0; - - std::vector work_buffer; + struct tensor_meta { + std::string name; + ggml_type type; + int32_t ne[2]; + size_t offset; + }; + std::map tensor_meta_map; + // load all tensor meta while (true) { if (fin.tell() == fin.size) { // eof @@ -9392,7 +8864,7 @@ static int llama_apply_lora_from_file_internal( fin.read_raw(&n_dims, sizeof(n_dims)); fin.read_raw(&name_len, sizeof(name_len)); - fin.read_raw(&ftype, sizeof(ftype)); + fin.read_raw(&ftype, sizeof(ftype)); if (n_dims != 1 && n_dims != 2) { LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); @@ -9406,31 +8878,23 @@ static int llama_apply_lora_from_file_internal( std::string name; { - GGML_ASSERT(name_len <= 1024); - char buf[1024]; + GGML_ASSERT(name_len < GGML_MAX_NAME); + char buf[GGML_MAX_NAME]; fin.read_raw(buf, name_len); name = std::string(buf, name_len); } - // check for lora suffix and get the type of tensor - const std::string lora_suffix = ".lora"; - size_t pos = name.rfind(lora_suffix); - if (pos == std::string::npos) { + // check for lora suffix + std::string lora_suffix; + if (name.length() > 6) { + lora_suffix = name.substr(name.length() - 6); + } + if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); return 1; } - std::string lora_type = name.substr(pos + lora_suffix.length()); - std::string base_name = name; - base_name.erase(pos); - // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str()); - - if (model_tensors.find(base_name) == model_tensors.end()) { - LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); - return 1; - } - - // create ggml tensor + // tensor type ggml_type wtype; switch (ftype) { case 0: wtype = 
GGML_TYPE_F32; break; @@ -9442,122 +8906,177 @@ static int llama_apply_lora_from_file_internal( return false; } } - ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]); - ggml_set_name(lora_tensor, name.c_str()); - // load tensor data + // data offset size_t offset = fin.tell(); - size_t tensor_data_size = ggml_nbytes(lora_tensor); offset = (offset + 31) & -32; - fin.seek(offset, SEEK_SET); - fin.read_raw(lora_tensor->data, tensor_data_size); - lora_tensors[name] = lora_tensor; + // skip tensor data + fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); - // check if we have both A and B tensors and apply - if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && - lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); + } - ggml_tensor * dest_t = model_tensors[base_name]; + bool warned = false; + int n_tensors = 0; - offload_func_t offload_func = ggml_offload_nop; - offload_func_t offload_func_force_inplace = ggml_offload_nop; + // apply + ggml_backend_t backend_cpu = ggml_backend_cpu_init(); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); + return 1; + } + ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { - if (dest_t->type != GGML_TYPE_F16) { - throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. 
dest_t->type: %d", __func__, dest_t->type)); - } - offload_func = ggml_cuda_assign_buffers; - offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; - } -#endif // GGML_USE_CUBLAS + std::vector> read_buf; + for (const auto & it : model.tensors_by_name) { + const std::string & base_name = it.first; + ggml_tensor * model_t = it.second; - ggml_tensor * base_t; - if (ml) { - struct gguf_context * ctx_gguf = ml->ctx_gguf; + if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || + tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { + continue; + } - // load from base model - if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; - } + tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); + tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); - base_t = ml->get_tensor_meta(base_name.c_str()); - ml->load_data_for(base_t); - } else { - base_t = dest_t; - } + ggml_init_params lora_init_params = { + /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), + /* .mem_buffer */ nullptr, + /* .no_alloc */ true, + }; + ggml_context * lora_ctx = ggml_init(lora_init_params); + if (lora_ctx == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); + ggml_backend_free(backend_cpu); + return 1; + } - if (ggml_is_quantized(base_t->type)) { - if (!warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; - } - } + // create tensors + ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); + ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); + ggml_set_name(loraA, metaA.name.c_str()); + ggml_set_name(loraB, metaB.name.c_str()); - ggml_tensor * loraA = 
lora_tensors[base_name + ".loraA"]; - GGML_ASSERT(loraA->type == GGML_TYPE_F32); - ggml_set_name(loraA, "loraA"); - - ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; - GGML_ASSERT(loraB->type == GGML_TYPE_F32); - ggml_set_name(loraB, "loraB"); - - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + ggml_tensor * base_t; + if (ml) { + if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) { + LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } + base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); + } else { + base_t = ggml_dup_tensor(lora_ctx, model_t); + } + ggml_set_name(base_t, base_name.c_str()); + // allocate in backend buffer + ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); + if (lora_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); + return 1; + } + + // load tensor data + auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { + read_buf.resize(ggml_nbytes(tensor)); + fin.seek(tensor_meta.offset, SEEK_SET); + fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); + }; + load_tensor(metaA, loraA); + load_tensor(metaB, loraB); + + // load base model tensor data + if (ml) { + ml->load_data_for(base_t); + } else { + ggml_backend_tensor_copy(model_t, base_t); + } + + if (ggml_is_quantized(base_t->type) && !warned) { + LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + + if (base_t->ne[0] != loraA->ne[1] || 
base_t->ne[1] != loraB->ne[1]) { + LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + ggml_free(lora_ctx); + ggml_backend_buffer_free(lora_buf); + ggml_backend_free(backend_cpu); + return 1; + } + + auto build_lora_graph = [&]() { // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB); - offload_func(BA); + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); ggml_set_name(BA, "BA"); if (scaling != 1.0f) { - BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling); - offload_func(BA); + BA = ggml_scale(lora_ctx, BA, scaling); ggml_set_name(BA, "BA_scaled"); } ggml_tensor * r; - if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx.get(), dest_t, BA); - offload_func_force_inplace(r); - ggml_set_name(r, "r_add_inplace"); - } - else { - r = ggml_add(lora_ctx.get(), base_t, BA); - offload_func(r); - ggml_set_name(r, "r_add"); + r = ggml_add_inplace(lora_ctx, base_t, BA); + ggml_set_name(r, "r_add"); - r = ggml_cpy(lora_ctx.get(), r, dest_t); - offload_func(r); - ggml_set_name(r, "r_cpy"); + if (base_t->type != model_t->type) { + // convert the result to the model type + r = ggml_cast(lora_ctx, r, model_t->type); + ggml_set_name(r, "r_cast"); } - struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get()); - ggml_build_forward_expand(gf, r); + return r; + }; - ggml_graph_compute_helper(work_buffer, gf, n_threads); + ggml_cgraph * gf = ggml_new_graph(lora_ctx); + ggml_tensor * r = build_lora_graph(); + ggml_build_forward_expand(gf, r); - // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other - GGML_ASSERT(lora_tensors.size() == 2); + ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); + if (graph_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); + 
ggml_free(lora_ctx); + ggml_backend_buffer_free(lora_buf); + ggml_backend_free(backend_cpu); + return 1; + } - // we won't need these tensors again, reset the context to save memory - lora_ctx.reset(ggml_init(params)); - lora_tensors.clear(); + ggml_backend_graph_compute(backend_cpu, gf); - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); - } + ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); + +#if 0 + // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU + //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); + + // sched compute + ggml_build_forward_expand(gf, build_graph()); + ggml_backend_sched_init_measure(sched, gf); + + // create the graph again, since the previous one was destroyed by the measure + ggml_graph_clear(gf); + ggml_build_forward_expand(gf, build_graph()); + ggml_backend_sched_graph_compute(sched, gf); + ggml_backend_sched_free(sched); +#endif + + ggml_backend_buffer_free(lora_buf); + ggml_backend_buffer_free(graph_buf); + ggml_free(lora_ctx); + + n_tensors++; + if (n_tensors % 4 == 0) { + LLAMA_LOG_INFO("."); } } + ggml_backend_free(backend_cpu); + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); @@ -9570,6 +9089,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, + /*.split_mode =*/ LLAMA_SPLIT_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -9581,7 +9101,8 @@ struct llama_model_params llama_model_default_params() { }; #ifdef GGML_USE_METAL - result.n_gpu_layers = 1; + // note: we usually have plenty of VRAM, so by default offload all layers to the GPU + result.n_gpu_layers = 999; #endif return result; @@ -9667,9 +9188,11 @@ int64_t llama_time_us(void) { return ggml_time_us(); } -struct llama_model * 
llama_load_model_from_file( - const char * path_model, - struct llama_model_params params) { +static struct llama_model * llama_load_model_from_file_internal( + const char * path_model, struct llama_model_params * params_p +) { + auto & params = *params_p; + ggml_time_init(); llama_model * model = new llama_model; @@ -9706,6 +9229,10 @@ struct llama_model * llama_load_model_from_file( return model; } +struct llama_model * llama_load_model_from_file(const char * path_model, struct llama_model_params params) { + return llama_load_model_from_file_internal(path_model, &params); +} + void llama_free_model(struct llama_model * model) { delete model; } @@ -9771,41 +9298,61 @@ struct llama_context * llama_new_context_with_model( GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - // reserve memory for context buffers if (!hparams.vocab_only) { - // initialize backend + // initialize backends #ifdef GGML_USE_METAL if (model->n_gpu_layers > 0) { - ctx->backend = ggml_backend_metal_init(); - if (ctx->backend == nullptr) { + ctx->backend_metal = ggml_backend_metal_init(); + if (ctx->backend_metal == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); } + ctx->backends.push_back(ctx->backend_metal); } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - // for testing only +#elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - ctx->backend = ggml_backend_cuda_init(0); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__); + // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); +
llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_LAYER requires a backend for each GPU + for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_cuda_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } } } +#elif defined(GGML_USE_KOMPUTE) + if (ggml_vk_has_device() && model->n_gpu_layers > 0) { + auto * backend = ggml_backend_kompute_init(); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif - - if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) { - ctx->backend = ggml_backend_cpu_init(); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); - } - } - - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__); + ctx->backend_cpu = ggml_backend_cpu_init(); + if (ctx->backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); delete ctx; return nullptr; } + ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, - cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, + cparams.n_ctx, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -9841,11 +9388,11 @@ struct llama_context * llama_new_context_with_model( } { - // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data + // buffer used to store the computation graph 
and the tensor meta data ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); - // create measure allocator - ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend); + ctx->sched = ggml_backend_sched_new(ctx->backends.data(), ctx->backends.size(), LLAMA_MAX_NODES); + ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); @@ -9853,50 +9400,19 @@ struct llama_context * llama_new_context_with_model( llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); - // measure memory requirements for the graph - size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf); + // initialize scheduler with the worst-case graph + ggml_backend_sched_init_measure(ctx->sched, gf); + // note: the number of splits during measure is higher than during inference due to the kv shift + int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); + LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); + ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0); - - // create allocator again with exact memory requirements - ggml_allocr_free(ctx->alloc); - - ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size); - ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (model->n_gpu_layers > 0) { - // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets - ggml_cuda_set_scratch_size(alloc_size 
+ 64); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); - - // calculate total VRAM usage - auto add_tensor = [](const ggml_tensor * t, size_t & size) { - if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) { - size += ggml_nbytes(t); - } - }; - size_t model_vram_size = 0; - for (const auto & kv : model->tensors_by_name) { - add_tensor(kv.second, model_vram_size); - } - - size_t kv_vram_size = 0; - for (auto & k : ctx->kv_self.k_l) { - add_tensor(k, kv_vram_size); - } - for (auto & v : ctx->kv_self.v_l) { - add_tensor(v, kv_vram_size); - } - - size_t ctx_vram_size = alloc_size + kv_vram_size; - size_t total_vram_size = model_vram_size + ctx_vram_size; - - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, - total_vram_size / 1024.0 / 1024.0, - model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + for (ggml_backend_t backend : ctx->backends) { + ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend); + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_name(backend), + ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); } -#endif } } @@ -9919,6 +9435,9 @@ struct llama_context * llama_new_context_with_model( void llama_free(struct llama_context * ctx) { delete ctx; +#ifdef GGML_USE_KOMPUTE + ggml_vk_free_device(); +#endif } const llama_model * llama_get_model(const struct llama_context * ctx) { @@ -9993,9 +9512,8 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3 } int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { - return snprintf(buf, buf_size, "%s %s%s %s", + return snprintf(buf, buf_size, "%s %s %s", llama_model_arch_name(model->arch).c_str(), - model->hparams.n_expert > 0 ? 
(std::to_string(model->hparams.n_expert) + "x").c_str() : "", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } @@ -10017,7 +9535,14 @@ uint64_t llama_model_n_params(const struct llama_model * model) { } struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { - return ggml_get_tensor(model->ctx, name); + auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), + [name](const std::pair<std::string, struct ggml_tensor *> & it) { + return it.first == name; + }); + if (it == model->tensors_by_name.end()) { + return nullptr; + } + return it->second; } uint32_t llama_model_quantize( @@ -10202,7 +9727,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf); + const size_t s_kv = ctx->kv_self.total_size(); const size_t s_total = ( + s_rng_size @@ -10331,7 +9856,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); const auto n_ctx = cparams.n_ctx; - const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf); + const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; const uint32_t kv_size = kv_self.size; const uint32_t kv_used = kv_self.used; @@ -10344,46 +9869,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - - std::vector<struct ggml_tensor *> kout2d(n_layer); - std::vector<struct ggml_tensor *> vout2d(n_layer); - - for (int il = 0; il < (int) n_layer; ++il) { - kout2d[il] = ggml_new_tensor_2d(cpy_ctx,
kv_self.k_l[il]->type, n_embd_k_gqa, kv_head); - vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa); - - ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd_k_gqa, kv_head, - elt_size*n_embd_k_gqa, 0); - - ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd_v_gqa, - elt_size*n_ctx, 0); - - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il])); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il])); - } - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); - - ggml_backend_graph_compute(ctx->backend, gf); - std::vector<uint8_t> tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - tmp_buf.resize(ggml_nbytes(kout2d[il])); - ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size()); + tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head); + ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); - tmp_buf.resize(ggml_nbytes(vout2d[il])); - ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); + // v is not contiguous, copy row by row + tmp_buf.resize(elt_size*kv_head); + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + } } - - ggml_free(cpy_ctx); - - ggml_backend_buffer_free(buf); } for (uint32_t i = 0; i < kv_size; ++i) { @@ -10482,48 +9980,22 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); if (kv_buf_size) { - GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size); + GGML_ASSERT(kv_self.total_size() == kv_buf_size); const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() +
ggml_graph_overhead(), NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); + for (int il = 0; il < (int) n_layer; ++il) { + size_t k_size = elt_size*n_embd_k_gqa*kv_head; + ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); + inp += k_size; - std::vector<struct ggml_tensor *> kin2d(n_layer); - std::vector<struct ggml_tensor *> vin2d(n_layer); - - for (int il = 0; il < n_layer; ++il) { - kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head); - vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa); - - ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd_k_gqa, kv_head, - elt_size*n_embd_k_gqa, 0); - - ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd_v_gqa, - elt_size*n_ctx, 0); - - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d)); + // v is not contiguous, copy row by row + size_t v_row_size = elt_size*kv_head; + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size); + inp += v_row_size; + } } - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); - - // load data into the tensors - for (int il = 0; il < n_layer; ++il) { - ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il])); - inp += ggml_nbytes(kin2d[il]); - - ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il])); - inp += ggml_nbytes(vin2d[il]); - } - - ggml_backend_graph_compute(ctx->backend, gf); - - ggml_free(cpy_ctx); - - ggml_backend_buffer_free(buf); } ctx->kv_self.head = kv_head; diff --git a/llama.h b/llama.h index c11075bbc..7abc9d099 100644 --- a/llama.h +++ b/llama.h @@ -45,7 +45,7 @@ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 3 -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) +#if defined(GGML_USE_CUBLAS) ||
defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. #define LLAMA_SUPPORTS_GPU_OFFLOAD #endif @@ -116,6 +116,12 @@ extern "C" { LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; + enum llama_split_mode { + LLAMA_SPLIT_NONE = 0, // single GPU + LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_ROW = 2, // split rows across GPUs + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -178,8 +184,15 @@ extern "C" { struct llama_model_params { int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors - const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) + enum llama_split_mode split_mode; // how to split the model across multiple GPUs + + // main_gpu interpretation depends on split_mode: + // LLAMA_SPLIT_NONE: the GPU that is used for the entire model + // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results + // LLAMA_SPLIT_LAYER: ignored + int32_t main_gpu; + // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES + const float * tensor_split; // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. // If the provided progress_callback returns true, model loading continues. 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7a60d7743..a0063bbb9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -360,7 +360,10 @@ struct test_case { // check if backends support op bool supported = true; for (ggml_backend_t backend : {backend1, backend2}) { - if (!ggml_backend_supports_op(backend, out)) { + if ( + !ggml_backend_supports_op(backend, out) + || (op_desc(out) == "MOE" && !strcmp(ggml_backend_name(backend), "Kompute")) + ) { printf("not supported [%s] ", ggml_backend_name(backend)); supported = false; } @@ -376,6 +379,11 @@ struct test_case { // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1); + if (buf == NULL) { + printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1)); + ggml_free(ctx); + return false; + } // build graph ggml_build_forward_expand(gf, out); @@ -463,19 +471,23 @@ struct test_case { GGML_UNUSED(index); }; - ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); + const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); - if (ud.ok) { - printf("\033[1;32mOK\033[0m\n"); - } else { - printf("\033[1;31mFAIL\033[0m\n"); + if (!cmp_ok) { + printf("compare failed "); } ggml_backend_buffer_free(buf); ggml_free(ctx); - return ud.ok; + if (ud.ok && cmp_ok) { + printf("\033[1;32mOK\033[0m\n"); + return true; + } + + printf("\033[1;31mFAIL\033[0m\n"); + return false; } bool eval_perf(ggml_backend_t backend, const char * op_name) { @@ -519,6 +531,11 @@ struct test_case { // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); + if (buf == NULL) { + printf("failed to allocate tensors\n"); + ggml_free(ctx); + return false; + } // randomize tensors initialize_tensors(ctx);