diff --git a/README.md b/README.md index 0378a674e..6ab6acf12 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well. - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) +- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama) - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index 9ab9ab06e..c4ec5c524 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -840,6 +840,8 @@ class OutputFile: self.gguf.add_base_model_version(key, base_model_entry["version"]) if "organization" in base_model_entry: self.gguf.add_base_model_organization(key, base_model_entry["organization"]) + if "description" in base_model_entry: + self.gguf.add_base_model_description(key, base_model_entry["description"]) if "url" in base_model_entry: self.gguf.add_base_model_url(key, base_model_entry["url"]) if "doi" in base_model_entry: @@ -849,12 +851,32 @@ class OutputFile: if "repo_url" in base_model_entry: self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"]) + if metadata.datasets is not None: + self.gguf.add_dataset_count(len(metadata.datasets)) + for key, dataset_entry in enumerate(metadata.datasets): + if "name" in dataset_entry: + self.gguf.add_dataset_name(key, dataset_entry["name"]) + if "author" in dataset_entry: + self.gguf.add_dataset_author(key, dataset_entry["author"]) + if "version" in dataset_entry: + self.gguf.add_dataset_version(key, dataset_entry["version"]) + if "organization" in dataset_entry: + self.gguf.add_dataset_organization(key, dataset_entry["organization"]) + if "description" in dataset_entry: + self.gguf.add_dataset_description(key, dataset_entry["description"]) + if "url" in dataset_entry: + self.gguf.add_dataset_url(key, dataset_entry["url"]) + if "doi" in dataset_entry: + self.gguf.add_dataset_doi(key, dataset_entry["doi"]) + if "uuid" in dataset_entry: + self.gguf.add_dataset_uuid(key, dataset_entry["uuid"]) + if "repo_url" in dataset_entry: + self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"]) + if metadata.tags is not None: self.gguf.add_tags(metadata.tags) if metadata.languages is not None: self.gguf.add_languages(metadata.languages) - if metadata.datasets is not None: - self.gguf.add_datasets(metadata.datasets) def add_meta_arch(self, params: Params) -> None: # Metadata About The Neural Architecture Itself diff --git a/examples/server/README.md b/examples/server/README.md index c911745f4..6f72c6bb8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -383,6 +383,10 @@ node index.js `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']` + `xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled. + + `xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. 
Default: `0.1` (> `0.5` disables XTC) + `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0. `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0` @@ -411,7 +415,7 @@ node index.js `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` - `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. + `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. **Response format** diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a6d3a1c95..cac55007a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -655,11 +655,16 @@ struct server_context { } bool validate_model_chat_template() const { - llama_chat_message chat[] = {{"user", "test"}}; - - const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0); - - return res > 0; + std::vector model_template(2048, 0); // longest known template is about 1200 bytes + std::string template_key = "tokenizer.chat_template"; + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); + if (res >= 0) { + llama_chat_message chat[] = {{"user", "test"}}; + std::string tmpl = std::string(model_template.data(), model_template.size()); + int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0); + return chat_res > 0; + } + return false; } void init() { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 255bc64c6..2dba15d23 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4350,6 +4350,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g if (op->op == GGML_OP_MUL_MAT) { a = op->src[0]; b = op->src[1]; + if (ggml_is_permuted(a) || ggml_is_permuted(b)) { + // TODO: fix like https://github.com/ggerganov/llama.cpp/pull/10021 + return false; + } } else { a = op->src[2]; b = op->src[1]; diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index b3159b9d1..72d8fdb87 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -8,7 +8,6 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep const int nthreads = item_ct1.get_local_range(2); const int nwarps = nthreads / WARP_SIZE; - assert(nwarps % WARP_SIZE == 0); sycl::float2 mean_var = sycl::float2(0.f, 0.f); for (int col = tid; col < ncols; col += block_size) { @@ -55,7 +54,6 @@ static void group_norm_f32(const float* x, float* dst, const int 
group_size, con int end = start + group_size; const int nthreads = item_ct1.get_local_range(2); const int nwarps = nthreads / WARP_SIZE; - assert(nwarps % WARP_SIZE == 0); start += item_ct1.get_local_id(2); int nreduce = nwarps / WARP_SIZE; @@ -144,7 +142,6 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa const int tid = item_ct1.get_local_id(2); const int nthreads = item_ct1.get_local_range(2); const int nwarps = nthreads / WARP_SIZE; - assert(nwarps % WARP_SIZE == 0); float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { @@ -202,6 +199,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols, } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; + assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0); const sycl::range<3> block_dims(1, 1, work_group_size); /* DPCT1049:17: The work-group size passed to the SYCL kernel may exceed @@ -244,6 +242,7 @@ static void group_norm_f32_sycl(const float* x, float* dst, } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; + assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0); const sycl::range<3> block_dims(1, 1, work_group_size); /* DPCT1049:18: The work-group size passed to the SYCL kernel may exceed @@ -290,6 +289,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; + assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0); const sycl::range<3> block_dims(1, 1, work_group_size); /* DPCT1049:19: The work-group size passed to the SYCL kernel may exceed diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index c2779df0e..e61cdc2ca 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -1,4 +1,5 @@ #include +#include #include "outprod.hpp" @@ -39,7 +40,7 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* sr try { // Perform matrix multiplication using oneMKL GEMM - oneapi::mkl::blas::gemm(*stream, + oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 6c4c92262..ec31e726a 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -196,6 +196,7 @@ struct vk_device_struct { vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_repeat_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; + vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16; vk_pipeline pipeline_norm_f32; vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; @@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin std::lock_guard guard(compile_count_mutex); assert(compile_count > 0); compile_count--; + + // "Progress bar" for shader compiles + static uint32_t total_compile_count = 0; + if ((total_compile_count++ % 10) == 0) { + std::cerr << "."; + } } compile_count_cond.notify_all(); } @@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector&& events static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); + std::cerr << "ggml_vulkan: Compiling shaders"; + // mulmat std::initializer_list warptile_l = { 128, 128, 128, 16, 
device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; std::initializer_list warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; @@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); @@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) { for (auto &c : compiles) { c.wait(); } + std::cerr << "Done!" << std::endl; } static vk_device ggml_vk_get_device(size_t idx) { @@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) { - if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) { - return ctx->device->pipeline_cpy_f32_f32; +static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) { + + // Choose "contiguous copy" shader if src/dst are contiguous + bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst)); + + if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_f32; + } else { + return ctx->device->pipeline_cpy_f32_f32; + } } - if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) { - return ctx->device->pipeline_cpy_f32_f16; + if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_f16; + } else { + return ctx->device->pipeline_cpy_f32_f16; + } } - if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) { - return ctx->device->pipeline_cpy_f16_f16; + if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f16_f16; + } else { + return ctx->device->pipeline_cpy_f16_f16; + } } - std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl; + std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl; GGML_ABORT("fatal error"); } @@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, 
vk_context& const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); + std::array elements; + + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } const vk_op_unary_push_constants pc = { (uint32_t)ne, @@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& 0.0f, 0.0f, }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements); } static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16); } else { to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& vk_pipeline to_fp16_vk_0 = nullptr; vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16); } else { to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte vk_pipeline to_fp16_vk_0 = nullptr; vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type); + return 
ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type); case GGML_OP_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_norm_f32; @@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_DIV: case GGML_OP_CONCAT: case GGML_OP_UPSCALE: - case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SIN: case GGML_OP_COS: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp b/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp index 7071302a4..ae8fa8753 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp new file mode 100644 index 000000000..9acbdd3d2 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp @@ -0,0 +1,42 @@ +#version 450 + +#include "types.comp" +#include "generic_unary_head.comp" + +#extension GL_EXT_control_flow_attributes : require + +const uint num_threads = 128; + +layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in; + +void main() { + uint idx = get_idx(); + + // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation + const uint num_iter = 4; + + // fast path for when all four iterations are in-bounds + if (idx + (num_iter-1)*num_threads < p.ne) { + [[unroll]] for (uint i = 0; i < num_iter; ++i) { +#ifndef OPTIMIZATION_ERROR_WORKAROUND + data_d[p.d_offset + idx] = D_TYPE(data_a[idx]); +#else + data_d[p.d_offset + idx] = data_a[idx]; +#endif + idx += num_threads; + } + } else { + [[unroll]] for (uint i = 0; i < num_iter; ++i) { + if (idx >= p.ne) { + continue; + } + +#ifndef OPTIMIZATION_ERROR_WORKAROUND + data_d[p.d_offset + idx] = D_TYPE(data_a[idx]); +#else + data_d[p.d_offset + idx] = data_a[idx]; +#endif + idx += num_threads; + } + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp index c26917c0f..2775068f9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp b/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp index f9a858cbf..fbd9d272c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp index eacdefc7d..4e1fa3af3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp @@ -1,4 +1,5 @@ #extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_control_flow_attributes : require layout (push_constant) uniform parameter { @@ -9,8 +10,6 @@ layout (push_constant) uniform parameter float param1; float param2; } p; -layout(local_size_x = 512, local_size_y = 1, 
local_size_z = 1) in; - layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp index a465cd52b..e87d8b18b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp b/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp index a86af87e7..c03f737cc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + uint src0_idx_mod(uint idx) { const uint i13 = idx / (p.ne12*p.ne11*p.ne10); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp b/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp index 5cd2f668d..5cfee8c3b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp @@ -3,12 +3,22 @@ #include "types.comp" #include "generic_unary_head.comp" +const uint num_threads = 128; + +layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in; + void main() { - const uint idx = get_idx(); + uint idx = get_idx(); - if (idx >= p.ne) { - return; + // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation + const uint num_iter = 4; + + [[unroll]] for (uint i = 0; i < num_iter; ++i) { + if (idx >= p.ne) { + continue; + } + + data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1)); + idx += num_threads; } - - data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1)); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp index 7faf9be93..67c48fb9a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/square.comp b/ggml/src/ggml-vulkan/vulkan-shaders/square.comp index 1fa118c99..2ff48ddc5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/square.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 477355c29..5c84f473f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -350,6 +350,9 @@ void process_shaders() { string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); string_to_spv("cpy_f16_f16", 
"copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); + string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7ab08b036..bc2b649d1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -64,15 +64,27 @@ class Keys: BASE_MODEL_AUTHOR = "general.base_model.{id}.author" BASE_MODEL_VERSION = "general.base_model.{id}.version" BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" + BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description" BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper BASE_MODEL_DOI = "general.base_model.{id}.doi" BASE_MODEL_UUID = "general.base_model.{id}.uuid" BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...) + # Dataset Source + DATASET_COUNT = "general.dataset.count" + DATASET_NAME = "general.dataset.{id}.name" + DATASET_AUTHOR = "general.dataset.{id}.author" + DATASET_VERSION = "general.dataset.{id}.version" + DATASET_ORGANIZATION = "general.dataset.{id}.organization" + DATASET_DESCRIPTION = "general.dataset.{id}.description" + DATASET_URL = "general.dataset.{id}.url" # Model Website/Paper + DATASET_DOI = "general.dataset.{id}.doi" + DATASET_UUID = "general.dataset.{id}.uuid" + DATASET_REPO_URL = "general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...) 
+ # Array based KV stores TAGS = "general.tags" LANGUAGES = "general.languages" - DATASETS = "general.datasets" class LLM: VOCAB_SIZE = "{arch}.vocab_size" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 0d8d8a0b0..7a55d1296 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -568,6 +568,9 @@ class GGUFWriter: def add_base_model_organization(self, source_id: int, organization: str) -> None: self.add_string(Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization) + def add_base_model_description(self, source_id: int, description: str) -> None: + self.add_string(Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description) + def add_base_model_url(self, source_id: int, url: str) -> None: self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url) @@ -580,15 +583,42 @@ class GGUFWriter: def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None: self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url) + def add_dataset_count(self, source_count: int) -> None: + self.add_uint32(Keys.General.DATASET_COUNT, source_count) + + def add_dataset_name(self, source_id: int, name: str) -> None: + self.add_string(Keys.General.DATASET_NAME.format(id=source_id), name) + + def add_dataset_author(self, source_id: int, author: str) -> None: + self.add_string(Keys.General.DATASET_AUTHOR.format(id=source_id), author) + + def add_dataset_version(self, source_id: int, version: str) -> None: + self.add_string(Keys.General.DATASET_VERSION.format(id=source_id), version) + + def add_dataset_organization(self, source_id: int, organization: str) -> None: + self.add_string(Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization) + + def add_dataset_description(self, source_id: int, description: str) -> None: + self.add_string(Keys.General.DATASET_DESCRIPTION.format(id=source_id), description) + + def add_dataset_url(self, source_id: int, url: str) -> None: + self.add_string(Keys.General.DATASET_URL.format(id=source_id), url) + + def add_dataset_doi(self, source_id: int, doi: str) -> None: + self.add_string(Keys.General.DATASET_DOI.format(id=source_id), doi) + + def add_dataset_uuid(self, source_id: int, uuid: str) -> None: + self.add_string(Keys.General.DATASET_UUID.format(id=source_id), uuid) + + def add_dataset_repo_url(self, source_id: int, repo_url: str) -> None: + self.add_string(Keys.General.DATASET_REPO_URL.format(id=source_id), repo_url) + def add_tags(self, tags: Sequence[str]) -> None: self.add_array(Keys.General.TAGS, tags) def add_languages(self, languages: Sequence[str]) -> None: self.add_array(Keys.General.LANGUAGES, languages) - def add_datasets(self, datasets: Sequence[str]) -> None: - self.add_array(Keys.General.DATASETS, datasets) - def add_tensor_data_layout(self, layout: str) -> None: self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index db318542a..321cbcd4c 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -41,7 +41,7 @@ class Metadata: base_models: Optional[list[dict]] = None tags: Optional[list[str]] = None languages: Optional[list[str]] = None - datasets: Optional[list[str]] = None + datasets: Optional[list[dict]] = None @staticmethod def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Path] = None, model_name: Optional[str] = None, total_params: int = 0) -> Metadata: @@ -91,9 +91,11 @@ class Metadata: # Base Models is 
received here as an array of models metadata.base_models = metadata_override.get("general.base_models", metadata.base_models) + # Datasets is received here as an array of datasets + metadata.datasets = metadata_override.get("general.datasets", metadata.datasets) + metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags) metadata.languages = metadata_override.get(Keys.General.LANGUAGES, metadata.languages) - metadata.datasets = metadata_override.get(Keys.General.DATASETS, metadata.datasets) # Direct Metadata Override (via direct cli argument) if model_name is not None: @@ -346,12 +348,12 @@ class Metadata: use_model_card_metadata("author", "model_creator") use_model_card_metadata("basename", "model_type") - if "base_model" in model_card: + if "base_model" in model_card or "base_models" in model_card or "base_model_sources" in model_card: # This represents the parent models that this is based on # Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges) # Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md metadata_base_models = [] - base_model_value = model_card.get("base_model", None) + base_model_value = model_card.get("base_model", model_card.get("base_models", model_card.get("base_model_sources", None))) if base_model_value is not None: if isinstance(base_model_value, str): @@ -364,18 +366,106 @@ class Metadata: for model_id in metadata_base_models: # NOTE: model size of base model is assumed to be similar to the size of the current model - model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params) base_model = {} - if model_full_name_component is not None: - base_model["name"] = Metadata.id_to_title(model_full_name_component) - if org_component is not None: - base_model["organization"] = Metadata.id_to_title(org_component) - if version is not None: - base_model["version"] = version - if org_component is not None and model_full_name_component is not None: - base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}" + if isinstance(model_id, str): + if model_id.startswith("http://") or model_id.startswith("https://") or model_id.startswith("ssh://"): + base_model["repo_url"] = model_id + + # Check if Hugging Face ID is present in URL + if "huggingface.co" in model_id: + match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", model_id) + if match: + model_id_component = match.group(1) + model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id_component, total_params) + + # Populate model dictionary with extracted components + if model_full_name_component is not None: + base_model["name"] = Metadata.id_to_title(model_full_name_component) + if org_component is not None: + base_model["organization"] = Metadata.id_to_title(org_component) + if version is not None: + base_model["version"] = version + + else: + # Likely a Hugging Face ID + model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params) + + # Populate model dictionary with extracted components + if model_full_name_component is not None: + base_model["name"] = Metadata.id_to_title(model_full_name_component) + if org_component is not None: + base_model["organization"] = Metadata.id_to_title(org_component) + if version is not None: + base_model["version"] = version + if org_component 
is not None and model_full_name_component is not None: + base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}" + + elif isinstance(model_id, dict): + base_model = model_id + + else: + logger.error(f"base model entry '{str(model_id)}' not in a known format") + metadata.base_models.append(base_model) + if "datasets" in model_card or "dataset" in model_card or "dataset_sources" in model_card: + # This represents the datasets that this was trained from + metadata_datasets = [] + dataset_value = model_card.get("datasets", model_card.get("dataset", model_card.get("dataset_sources", None))) + + if dataset_value is not None: + if isinstance(dataset_value, str): + metadata_datasets.append(dataset_value) + elif isinstance(dataset_value, list): + metadata_datasets.extend(dataset_value) + + if metadata.datasets is None: + metadata.datasets = [] + + for dataset_id in metadata_datasets: + # NOTE: model size of base model is assumed to be similar to the size of the current model + dataset = {} + if isinstance(dataset_id, str): + if dataset_id.startswith(("http://", "https://", "ssh://")): + dataset["repo_url"] = dataset_id + + # Check if Hugging Face ID is present in URL + if "huggingface.co" in dataset_id: + match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", dataset_id) + if match: + dataset_id_component = match.group(1) + dataset_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(dataset_id_component, total_params) + + # Populate dataset dictionary with extracted components + if dataset_name_component is not None: + dataset["name"] = Metadata.id_to_title(dataset_name_component) + if org_component is not None: + dataset["organization"] = Metadata.id_to_title(org_component) + if version is not None: + dataset["version"] = version + + else: + # Likely a Hugging Face ID + dataset_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(dataset_id, total_params) + + # Populate dataset dictionary with extracted components + if dataset_name_component is not None: + dataset["name"] = Metadata.id_to_title(dataset_name_component) + if org_component is not None: + dataset["organization"] = Metadata.id_to_title(org_component) + if version is not None: + dataset["version"] = version + if org_component is not None and dataset_name_component is not None: + dataset["repo_url"] = f"https://huggingface.co/{org_component}/{dataset_name_component}" + + elif isinstance(dataset_id, dict): + dataset = dataset_id + + else: + logger.error(f"dataset entry '{str(dataset_id)}' not in a known format") + + metadata.datasets.append(dataset) + use_model_card_metadata("license", "license") use_model_card_metadata("license_name", "license_name") use_model_card_metadata("license_link", "license_link") @@ -386,9 +476,6 @@ class Metadata: use_array_model_card_metadata("languages", "languages") use_array_model_card_metadata("languages", "language") - use_array_model_card_metadata("datasets", "datasets") - use_array_model_card_metadata("datasets", "dataset") - # Hugging Face Parameter Heuristics #################################### @@ -493,6 +580,8 @@ class Metadata: gguf_writer.add_base_model_version(key, base_model_entry["version"]) if "organization" in base_model_entry: gguf_writer.add_base_model_organization(key, base_model_entry["organization"]) + if "description" in base_model_entry: + gguf_writer.add_base_model_description(key, base_model_entry["description"]) if "url" in 
base_model_entry: gguf_writer.add_base_model_url(key, base_model_entry["url"]) if "doi" in base_model_entry: @@ -502,9 +591,29 @@ class Metadata: if "repo_url" in base_model_entry: gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"]) + if self.datasets is not None: + gguf_writer.add_dataset_count(len(self.datasets)) + for key, dataset_entry in enumerate(self.datasets): + if "name" in dataset_entry: + gguf_writer.add_dataset_name(key, dataset_entry["name"]) + if "author" in dataset_entry: + gguf_writer.add_dataset_author(key, dataset_entry["author"]) + if "version" in dataset_entry: + gguf_writer.add_dataset_version(key, dataset_entry["version"]) + if "organization" in dataset_entry: + gguf_writer.add_dataset_organization(key, dataset_entry["organization"]) + if "description" in dataset_entry: + gguf_writer.add_dataset_description(key, dataset_entry["description"]) + if "url" in dataset_entry: + gguf_writer.add_dataset_url(key, dataset_entry["url"]) + if "doi" in dataset_entry: + gguf_writer.add_dataset_doi(key, dataset_entry["doi"]) + if "uuid" in dataset_entry: + gguf_writer.add_dataset_uuid(key, dataset_entry["uuid"]) + if "repo_url" in dataset_entry: + gguf_writer.add_dataset_repo_url(key, dataset_entry["repo_url"]) + if self.tags is not None: gguf_writer.add_tags(self.tags) if self.languages is not None: gguf_writer.add_languages(self.languages) - if self.datasets is not None: - gguf_writer.add_datasets(self.datasets) diff --git a/gguf-py/tests/test_metadata.py b/gguf-py/tests/test_metadata.py index 81a2a30ae..40d484f4e 100755 --- a/gguf-py/tests/test_metadata.py +++ b/gguf-py/tests/test_metadata.py @@ -182,8 +182,43 @@ class TestMetadataMethod(unittest.TestCase): expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}] expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'] expect.languages=['en'] - expect.datasets=['teknium/OpenHermes-2.5'] + expect.datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}] + self.assertEqual(got, expect) + # Base Model spec is inferred from model id + model_card = {'base_models': 'teknium/OpenHermes-2.5'} + expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) + self.assertEqual(got, expect) + + # Base Model spec is only url + model_card = {'base_models': ['https://huggingface.co/teknium/OpenHermes-2.5']} + expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) + self.assertEqual(got, expect) + + # Base Model spec is given directly + model_card = {'base_models': [{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]} + expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': 
'2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) + self.assertEqual(got, expect) + + # Dataset spec is inferred from model id + model_card = {'datasets': 'teknium/OpenHermes-2.5'} + expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) + self.assertEqual(got, expect) + + # Dataset spec is only url + model_card = {'datasets': ['https://huggingface.co/teknium/OpenHermes-2.5']} + expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) + self.assertEqual(got, expect) + + # Dataset spec is given directly + model_card = {'datasets': [{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]} + expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) self.assertEqual(got, expect) def test_apply_metadata_heuristic_from_hf_parameters(self): diff --git a/include/llama.h b/include/llama.h index ccb48f73c..5e742642e 100644 --- a/include/llama.h +++ b/include/llama.h @@ -797,7 +797,7 @@ extern "C" { // Processes a batch of tokens with the ecoder part of the encoder-decoder model. // Stores the encoder output internally for later use by the decoder cross-attention layers. // 0 - success - // < 0 - error + // < 0 - error. the KV cache state is restored to the state before this call LLAMA_API int32_t llama_encode( struct llama_context * ctx, struct llama_batch batch); @@ -805,7 +805,7 @@ extern "C" { // Positive return values does not mean a fatal error, but rather a warning. // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) - // < 0 - error + // < 0 - error. 
the KV cache state is restored to the state before this call LLAMA_API int32_t llama_decode( struct llama_context * ctx, struct llama_batch batch); diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index e82984f49..199237a21 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -89952d649e0c5cabbb9ff8c4906f5a843a789fb2 +8a3d799484d861748f86eb87c8314fa2dbccc254 diff --git a/src/llama.cpp b/src/llama.cpp index 7e7a61196..6ec419e9b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3502,11 +3502,24 @@ static bool llama_kv_cache_init( return true; } +// a structure holds information about the slot found in llama_kv_cache_find_slot +struct llama_kv_cache_slot_info { + std::pair boundaries; // slot boundaries [begin, end) + bool found = false; // the slot was found + + explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} + llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} + + operator bool() const { return found; } +}; +static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; + // find an empty slot of size "n_tokens" in the cache // updates the cache head +// returns a structure holding information about the slot found // Note: On success, it's important that cache.head points // to the first cell of the slot. -static bool llama_kv_cache_find_slot( +static struct llama_kv_cache_slot_info llama_kv_cache_find_slot( struct llama_kv_cache & cache, const struct llama_ubatch & batch) { const uint32_t n_tokens = batch.n_tokens; @@ -3534,7 +3547,7 @@ static bool llama_kv_cache_find_slot( // too big seq_id // TODO: would it be possible to resize the cache instead? LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size); - return false; + return llama_kv_cache_slot_info_failed; } if (j > 0) { llama_kv_cell & seq = cache.cells[seq_id]; @@ -3669,15 +3682,17 @@ static bool llama_kv_cache_find_slot( // allow getting the range of used cells, from head to head + n cache.head = min; cache.n = max - min + 1; + cache.used = std::count_if(cache.cells.begin(), cache.cells.end(), + [](const llama_kv_cell& cell){ return !cell.is_empty(); }); // sanity check - return cache.n >= n_seqs; + return llama_kv_cache_slot_info(cache.n >= n_seqs); } // otherwise, one cell per token. if (n_tokens > cache.size) { LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size); - return false; + return llama_kv_cache_slot_info_failed; } uint32_t n_tested = 0; @@ -3705,7 +3720,7 @@ static bool llama_kv_cache_find_slot( if (n_tested >= cache.size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); - return false; + return llama_kv_cache_slot_info_failed; } } @@ -3722,7 +3737,7 @@ static bool llama_kv_cache_find_slot( cache.used += n_tokens; - return true; + return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens); } // find how many cells are currently in use @@ -3998,6 +4013,53 @@ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) return cparams.flash_attn ? 256u : 32u; } +// saves the kv_cache state for future recovery. +// used to rollback llama_kv_cache_find_slot changes. 
+struct llama_kv_slot_restorer { + struct llama_kv_cache_state { + uint32_t head = 0; + uint32_t n = 0; + } old_state; + + // for non-recurrent models only + // list of slots to restore + std::vector> slot_boundaries; + + bool do_restore = false; + + explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) { + old_state.head = cache.head; + old_state.n = cache.n; + } + + // saves a slot information for future restoration + void save(const struct llama_kv_cache_slot_info & slot) { + if (slot) { + do_restore = true; + if (slot.boundaries.first != slot.boundaries.second) { + slot_boundaries.push_back(slot.boundaries); + } + } + } + + // must be explicitly called to restore the kv_cache state + // and rollback changes from all llama_kv_cache_find_slot calls + void restore(struct llama_kv_cache & cache) { + if (do_restore) { + cache.head = old_state.head; + cache.n = old_state.n; + + if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased + llama_kv_cache_seq_rm(cache, -1, -1, -1); + } else { + for (auto & slot : slot_boundaries) { + llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second); + } + } + } + } +}; + // // model loading and saving // @@ -17185,7 +17247,8 @@ static void llama_output_reorder(struct llama_context * ctx) { } } -static void llama_graph_compute( +// returns the result of ggml_backend_sched_graph_compute_async execution +static enum ggml_status llama_graph_compute( llama_context & lctx, ggml_cgraph * gf, int n_threads, @@ -17200,15 +17263,20 @@ static void llama_graph_compute( set_n_threads_fn.second(set_n_threads_fn.first, n_threads); } - auto err = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); - if (err != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err); + auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); } // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); + + return status; } // decode a batch of tokens by evaluating the transformer +// in case of unsuccessful decoding (error or warning), +// the kv_cache state will be returned to its original state +// (for non-recurrent models) or cleaned (for recurrent models) // // - lctx: llama context // - batch: batch to evaluate @@ -17258,6 +17326,7 @@ static int llama_decode_internal( lctx.n_queued_tokens += n_tokens_all; auto & kv_self = lctx.kv_self; + llama_kv_slot_restorer kv_slot_restorer(kv_self); const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; @@ -17342,9 +17411,11 @@ static int llama_decode_internal( kv_self.head = 0; } - if (!llama_kv_cache_find_slot(kv_self, ubatch)) { + const auto slot = llama_kv_cache_find_slot(kv_self, ubatch); + if (!slot) { return 1; } + kv_slot_restorer.save(slot); if (!kv_self.recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized @@ -17391,7 +17462,19 @@ static int llama_decode_internal( llama_set_inputs(lctx, ubatch); - llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + if (compute_status != GGML_STATUS_SUCCESS) { + kv_slot_restorer.restore(kv_self); + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + 
default: + return -3; + } + } // update the kv ring buffer { @@ -17628,7 +17711,18 @@ static int llama_encode_internal( llama_set_inputs(lctx, ubatch); - llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } // extract embeddings if (embd) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 65be43281..6618d03d1 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -681,6 +681,7 @@ struct test_case { // run int64_t total_time_us = 0; + int64_t total_mem = 0; int total_runs = 0; do { int64_t start_time = ggml_time_us(); @@ -688,6 +689,7 @@ struct test_case { int64_t end_time = ggml_time_us(); total_time_us += end_time - start_time; + total_mem += mem; total_runs += n_runs; } while (total_time_us < 1000*1000); // run for at least 1 second @@ -717,7 +719,7 @@ struct test_case { } else { printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m", op_size(out) / 1024, - mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0); + total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0); } printf("\n"); @@ -2740,6 +2742,13 @@ struct test_flash_attn_ext : public test_case { return 5e-4; } + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + // Just counting matmul costs: + // Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head + return 2 * 2 * nh * nb * hs * kv; + } + test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16) : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {} @@ -3779,6 +3788,8 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1})); test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); + for (int bs : {1, 512}) { for (ggml_type type_a : all_types) { for (ggml_type type_b : {GGML_TYPE_F32}) {
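The server README additions above document the XTC sampler controls alongside the updated default sampler order. A hedged request sketch against a locally running `llama-server`; the host, port, `/completion` endpoint, and the `content` response field are assumptions based on the server's documented defaults, while `xtc_probability`, `xtc_threshold`, and the `samplers` list come from the README text in this patch:

```python
import json
import urllib.request

# Assumes llama-server is listening on its default address.
payload = {
    "prompt": "Write a haiku about mountains.",
    "n_predict": 64,
    "xtc_probability": 0.5,  # chance of applying XTC token removal; 0.0 disables it
    "xtc_threshold": 0.1,    # minimum token probability for removal; > 0.5 disables XTC
    "samplers": ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"],
}

req = urllib.request.Request(
    "http://127.0.0.1:8080/completion",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["content"])
```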
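In `ggml_vk_cpy_to_contiguous`, the dispatch is now spread over a 3D grid instead of a flat `{ ne, 1, 1 }`, presumably so large copies do not exceed per-dimension dispatch limits while small ones stay a single row. A small Python model of that `elements` calculation; the function name is illustrative, while the thresholds and `CEIL_DIV` logic mirror the patch:

```python
def vk_copy_dispatch(ne: int) -> tuple[int, int, int]:
    """Mirror of the 3D `elements` grid chosen in ggml_vk_cpy_to_contiguous."""
    def ceil_div(a: int, b: int) -> int:
        return (a + b - 1) // b

    if ne > 262144:   # more than 512 * 512 elements: grow the z dimension
        return (512, 512, ceil_div(ne, 262144))
    if ne > 512:      # more than one 512-wide row: grow the y dimension
        return (512, ceil_div(ne, 512), 1)
    return (ne, 1, 1) # small copies fit in a single row


# Example: the new perf test copies a 512x3072 f32 tensor to f16,
# i.e. 1,572,864 elements -> a (512, 512, 6) dispatch.
print(vk_copy_dispatch(512 * 3072))
```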
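The gguf-py changes replace the flat `general.datasets` string array with structured `general.dataset.{id}.*` keys, written through the new `GGUFWriter.add_dataset_*` helpers and populated either from model-card heuristics or from a metadata override. A minimal sketch of such an override file, assuming the converters' `--metadata` override flag; the `general.datasets` key and the per-entry fields follow `Metadata.load()` and the writer methods added in this patch, while the description text is purely illustrative:

```python
import json

# Hypothetical override file (e.g. passed as `--metadata override.json` to a convert script).
# "general.datasets" is now a list of dicts; each entry is emitted as the new
# general.dataset.{id}.* keys via GGUFWriter.add_dataset_name/organization/version/... calls.
override = {
    "general.datasets": [
        {
            "name": "OpenHermes 2.5",
            "organization": "Teknium",
            "version": "2.5",
            "repo_url": "https://huggingface.co/teknium/OpenHermes-2.5",
            # illustrative free-text field mapped to general.dataset.0.description
            "description": "Synthetic instruction-tuning dataset",
        },
    ],
}

with open("override.json", "w") as f:
    json.dump(override, f, indent=2)
```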