diff --git a/Makefile b/Makefile
index d2b2d14c1..28bc17b75 100644
--- a/Makefile
+++ b/Makefile
@@ -456,8 +456,14 @@ endif # LLAMA_CLBLAST
 ifdef LLAMA_VULKAN
 	CFLAGS += -DGGML_USE_VULKAN
 	CXXFLAGS += -DGGML_USE_VULKAN
-	LDFLAGS += -lvulkan
+	LDFLAGS += -lvulkan -lcblas
 	OBJS += ggml-vulkan.o
+
+ifdef LLAMA_VULKAN_CHECK_RESULTS
+	CFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+	CXXFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
 ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_VULKAN
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 32a10f530..ce91da057 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -303,7 +303,7 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s
 static vk_pipeline ggml_vk_create_pipeline_from_file(const std::string& name, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) {
 #ifdef VK_DEBUG
-    std::cerr << "ggml_vk_create_pipeline_from_file(" << path << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
+    std::cerr << "ggml_vk_create_pipeline_from_file(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
 #endif

     const std::string path = "vk_shaders/" + name + (vk_device.fp16 ? "" : "_fp32") + ".comp";
@@ -663,6 +663,8 @@ static inline bool ggml_vk_build_shader(ggml_type type) {
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q6_K:
             return true;
         default:
@@ -681,25 +683,25 @@ static void ggml_vk_load_shaders() {
     auto warptile_s = { 32, 32, 32, 8, 32, 32, 2, 2, 2 };

     vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline_from_file("matmul_f32_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
-    vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline_from_file("matmul_f32_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
+    vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline_from_file("matmul_f32_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
     vk_pipeline_matmul_f32_s = ggml_vk_create_pipeline_from_file("matmul_f32_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
     vk_pipeline_matmul_f32_aligned_l = ggml_vk_create_pipeline_from_file("matmul_f32_aligned_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
-    vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f32_aligned_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
+    vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f32_aligned_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
     vk_pipeline_matmul_f32_aligned_s = ggml_vk_create_pipeline_from_file("matmul_f32_aligned_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);

     vk_pipeline_matmul_f16_l = ggml_vk_create_pipeline_from_file("matmul_f16_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
-    vk_pipeline_matmul_f16_m = ggml_vk_create_pipeline_from_file("matmul_f16_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
+    vk_pipeline_matmul_f16_m = ggml_vk_create_pipeline_from_file("matmul_f16_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
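An illustrative aside, not part of the patch: the `_l`/`_m`/`_s` pipelines trade output-tile size for occupancy, and the wg_denoms passed above ({128, 128, 1}, now {64, 64, 1} for the medium variants, {32, 32, 1} for small) have to match the tile each shader variant actually computes. A caller might pick a size class roughly like the sketch below; the thresholds here are assumptions, and the real selection logic lives elsewhere in ggml-vulkan.cpp.

// Hypothetical sketch only -- not patch content.
static vk_pipeline * example_pick_matmul_f32_pipeline(int m, int n) {
    if (m <= 32 || n <= 32) {
        return &vk_pipeline_matmul_f32_s;  // 32x32 output tile
    }
    if (m <= 64 || n <= 64) {
        return &vk_pipeline_matmul_f32_m;  // 64x64 output tile (changed by this patch)
    }
    return &vk_pipeline_matmul_f32_l;      // 128x128 output tile
}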
     vk_pipeline_matmul_f16_s = ggml_vk_create_pipeline_from_file("matmul_f16_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
     vk_pipeline_matmul_f16_aligned_l = ggml_vk_create_pipeline_from_file("matmul_f16_aligned_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
-    vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f16_aligned_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
+    vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f16_aligned_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
     vk_pipeline_matmul_f16_aligned_s = ggml_vk_create_pipeline_from_file("matmul_f16_aligned_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);

     vk_pipeline_matmul_f16_f32_l = ggml_vk_create_pipeline_from_file("matmul_f16_f32_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
-    vk_pipeline_matmul_f16_f32_m = ggml_vk_create_pipeline_from_file("matmul_f16_f32_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
+    vk_pipeline_matmul_f16_f32_m = ggml_vk_create_pipeline_from_file("matmul_f16_f32_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
     vk_pipeline_matmul_f16_f32_s = ggml_vk_create_pipeline_from_file("matmul_f16_f32_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
     vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline_from_file("matmul_f16_f32_aligned_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
-    vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f16_f32_aligned_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
+    vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f16_f32_aligned_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
     vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline_from_file("matmul_f16_f32_aligned_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);

     // Build dequant shaders
@@ -737,7 +739,7 @@ static void ggml_vk_load_shaders() {
 void ggml_vk_test_transfer(size_t ne);
 void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
 void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
-void ggml_vk_test_buffer_write_zeropad(size_t m, size_t k, size_t align);
+void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);

 void ggml_vk_init(void) {
 #ifdef VK_DEBUG
@@ -894,10 +896,6 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
     vk_fence = vk_device.device.createFence({});

 #if defined(VK_CHK_KERNEL)
-    ggml_vk_test_buffer_write_zeropad(233, 97, 128);
-    ggml_vk_test_buffer_write_zeropad(233, 97, 1);
-    ggml_vk_test_buffer_write_zeropad(256, 128, 1);
-
     int step = 16;
     for (size_t m = step; m < 64; m += step) {
         ggml_vk_test_transfer(1024 * 1024 * m);
@@ -936,6 +934,14 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
         ggml_vk_test_matmul_f16(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 1);
         ggml_vk_test_matmul_f16(vals[i], vals[i + 1], vals[i + 2], 1000, 1, 2);
         ggml_vk_test_matmul_f16(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 2);
+        std::cerr << std::endl;
+
+        ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 1, 0);
+        ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 0);
+        ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 1, 1);
+        ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 1);
+        ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 1, 2);
+        ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 2);
         std::cerr << std::endl << std::endl;
     }
 #endif
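For readability, the unrolled f16_f32 test calls added above amount to the sweep sketched below: shader_size selects the pipeline class (0 = S, 1 = M, 2 = L, per the mapping inside ggml_vk_test_matmul_f16_f32 further down) and split_k toggles the split-K reduction path. Sketch only, not patch content.

// Equivalent sweep over the six (split_k, shader_size) combinations above.
for (int shader_size = 0; shader_size < 3; shader_size++) {
    for (int s = 0; s < 2; s++) {
        const int split_k = (s == 0) ? 1 : 4;
        ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, split_k, shader_size);
    }
}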
@@ -952,6 +958,8 @@ static inline vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) {
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q6_K:
             break;
         default:
@@ -972,6 +980,8 @@ static inline vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type, bo
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q6_K:
             break;
         default:
@@ -2539,7 +2549,7 @@ void ggml_vk_build_graph(ggml_tensor * node){
     }
 }

-bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor){
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
@@ -2583,6 +2593,11 @@ bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_te
         return false;
     }

+    if (extra == nullptr) {
+        // Graph hasn't been prepared, fall back to CPU
+        return false;
+    }
+
     if (params->ith != 0) {
         return true;
     }
@@ -2590,7 +2605,9 @@ bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_te
         return true;
     }

-    GGML_ASSERT(extra);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+    ggml_vk_check_results_0(params, tensor);
+#endif

     // Do staging buffer copies
     for (auto& cpy : extra->memcpys) {
@@ -2630,6 +2647,162 @@ void ggml_vk_graph_cleanup() {
     vk_gc.extras.clear();
 }

+#ifdef GGML_VULKAN_CHECK_RESULTS
+void * comp_result;
+void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) {
+    if (params->ith != 0) {
+        return;
+    }
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+
+    struct ggml_init_params iparams = {
+        .mem_size   = 512*1024*1024,
+        .mem_buffer = NULL,
+    };
+
+    // memory allocation happens here
+    struct ggml_context * ctx = ggml_init(iparams);
+
+    struct ggml_tensor * src0_clone = nullptr;
+    struct ggml_tensor * src1_clone = nullptr;
+    struct ggml_tensor * tensor_clone = nullptr;
+
+    size_t src0_size;
+    size_t src1_size;
+
+    if (src0 != nullptr) {
+        src0_clone = ggml_dup_tensor(ctx, src0);
+
+        // Some tensors have wrong strides for some reason
+        src0_size = src0->nb[1] * src0->ne[1] * src0->ne[2] * src0->ne[3];
+
+        src0_clone->data = malloc(src0_size);
+        if (src0->backend == GGML_BACKEND_CPU) {
+            memcpy(src0_clone->data, src0->data, src0_size);
+        } else if (src0->backend == GGML_BACKEND_GPU) {
+            ggml_vk_buffer_read((vk_buffer *)src0->data, 0, src0_clone->data, src0_size, vk_device.transfer_queues[0]);
+        } else {
+            GGML_ASSERT(false);
+        }
+
+        memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
+
+        for (size_t i = 0; i < 4; i++) {
+            GGML_ASSERT(src0_clone->ne[i] == src0->ne[i]);
+            GGML_ASSERT(src0_clone->nb[i] == src0->nb[i]);
+        }
+    }
+    if (src1 != nullptr) {
+        src1_clone = ggml_dup_tensor(ctx, src1);
+
+        src1_size = src1->ne[3] * src1->nb[3];
+
+        src1_clone->data = malloc(src1_size);
+        if (src1->backend == GGML_BACKEND_CPU) {
+            memcpy(src1_clone->data, src1->data, src1_size);
+        } else if (src1->backend == GGML_BACKEND_GPU) {
+            ggml_vk_buffer_read((vk_buffer *)src1->data, 0, src1_clone->data, src1_size, vk_device.transfer_queues[0]);
+        } else {
+            GGML_ASSERT(false);
+        }
+
+        memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
+
+        for (size_t i = 0; i < 4; i++) {
+            GGML_ASSERT(src1_clone->ne[i] == src1->ne[i]);
+            GGML_ASSERT(src1_clone->nb[i] == src1->nb[i]);
+        }
+    }
+
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        tensor_clone = ggml_mul_mat(ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_MUL) {
+        tensor_clone = ggml_mul(ctx, src0_clone, src1_clone);
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    struct ggml_cgraph cgraph = ggml_build_forward(tensor_clone);
+
+    ggml_graph_compute_with_ctx(ctx, &cgraph, 8);
+
+    size_t tensor_size = tensor_clone->ne[3] * tensor_clone->nb[3];
+
+    comp_result = malloc(tensor_size);
+    memcpy(comp_result, tensor_clone->data, tensor_size);
+
+    free(src0_clone->data);
+    free(src1_clone->data);
+
+    ggml_free(ctx);
+}
+
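ggml_vk_check_results_0 above runs before the Vulkan op and stores the CPU reference in comp_result; ggml_vk_check_results_1 below runs after it (see the ggml.c hook further down) and compares elementwise. For a contiguous F32 tensor the comparison boils down to the sketch below; the real code walks the nb[] strides and prints per-element diagnostics, and the 1.0 threshold matches the assert in the function that follows. Sketch only, not patch content.

// Minimal sketch, assuming contiguous float data.
static double example_avg_abs_err(const float * correct, const float * result, size_t n) {
    double err = 0.0;
    for (size_t i = 0; i < n; i++) {
        err += std::fabs((double) correct[i] - (double) result[i]);
    }
    return err / (double) n;  // flagged if > 1.0 or NaN
}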
+void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor) {
+    if (params->ith != 0) {
+        return;
+    }
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+
+    double avg_err = 0.0f;
+
+    for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
+        for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                    if (tensor->type == GGML_TYPE_F32) {
+                        float correct = *(float *) ((char *) comp_result + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
+                        float result  = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
+
+                        if (std::isnan(correct) || std::isnan(result)) {
+                            std::cerr << "ERROR: NaN value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << std::endl;
+                            std::cerr << "tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << std::endl;
+                            if (tensor->src[0] != nullptr) {
+                                std::cerr << "src0 type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << std::endl;
+                            }
+                            if (tensor->src[1] != nullptr) {
+                                std::cerr << "src1 type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << std::endl;
+                            }
+                            GGML_ASSERT(false);
+                        }
+
+                        avg_err += std::fabs(correct - result);
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                }
+            }
+        }
+    }
+
+    avg_err /= tensor->ne[3] * tensor->ne[2] * tensor->ne[1] * tensor->ne[0];
+
+    if (avg_err > 1.0 || std::isnan(avg_err)) {
+        std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << std::endl;
+        std::cerr << "tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << std::endl;
+        if (tensor->src[0] != nullptr) {
+            std::cerr << "src0 type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << std::endl;
+        }
+        if (tensor->src[1] != nullptr) {
+            std::cerr << "src1 type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << std::endl;
+        }
+        GGML_ASSERT(false);
+    }
+
+    free(comp_result);
+    comp_result = nullptr;
+}
+#endif
+
 #ifdef VK_CHK_KERNEL
 void ggml_vk_test_transfer(size_t ne) {
 #ifdef VK_DEBUG
@@ -2728,8 +2901,8 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
         y[i] = rand() / (float)RAND_MAX;
     }

-    seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_X, 0, x, sizeof(float) * k, sizeof(float) * k, m, sizeof(float) * p->align, vk_device.transfer_queues[0], {}, {}));
-    seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, sizeof(float) * p->align, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(float) * k, sizeof(float) * k, m, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queues[0], {}, {}));

     ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);

@@ -2840,8 +3013,8 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
         y[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
     }

-    seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, sizeof(ggml_fp16_t) * p->align, vk_device.transfer_queues[0], {}, {}));
-    seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_Y, 0, y, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, n, sizeof(ggml_fp16_t) * p->align, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, n, vk_device.transfer_queues[0], {}, {}));

     ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);

@@ -2906,72 +3079,119 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
     free(d);
 }

-void ggml_vk_test_buffer_write_zeropad(size_t m, size_t k, size_t align) {
+void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size) {
 #ifdef VK_DEBUG
-    std::cerr << "ggml_vk_test_buffer_write_zeropad(" << m << ", " << k << ", " << align << ")" << std::endl;
+    std::cerr << "ggml_vk_test_matmul_f16_f32(" << m << ", " << n << ", " << k << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
 #endif
+    if (!vk_device.fp16) {
+        return;
+    }
+    const size_t x_ne = m * k;
+    const size_t y_ne = k * n;
+    const size_t d_ne = m * n;
+
     std::vector seq;

-    const size_t kpad = ggml_vk_align_size(k, align);
-
-    vk_buffer d_X;
-    ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * kpad * m, &d_X, {});
-    vk_buffer d_X2;
-    ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * k * m, &d_X2, {});
-
-    ggml_fp16_t* x = (ggml_fp16_t *) ggml_vk_host_malloc(sizeof(ggml_fp16_t) * m * k);
-
-    for (size_t i = 0; i < m * k; i++) {
-        x[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
+    vk_pipeline * p;
+    std::string shname;
+    if (shader_size == 0) {
+        p = &vk_pipeline_matmul_f16_f32_s;
+        shname = "F16_F32_S";
+    } else if (shader_size == 1) {
+        p = &vk_pipeline_matmul_f16_f32_m;
+        shname = "F16_F32_M";
+    } else if (shader_size == 2) {
+        p = &vk_pipeline_matmul_f16_f32_l;
+        shname = "F16_F32_L";
+    } else {
+        GGML_ASSERT(0);
     }

-    seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, sizeof(ggml_fp16_t) * align, vk_device.transfer_queues[0], {}, {}));
+    const size_t kpad = ggml_vk_align_size(k, p->align);
+
+    ggml_vk_pipeline_allocate_descriptor_sets(*p, num_it);
+    if (split_k > 1) {
+        ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, num_it);
+    }
+
+    vk_buffer d_X;
+    vk_buffer d_Y;
+    vk_buffer d_D;
+    ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * kpad * m, &d_X, {});
+    ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * kpad * n, &d_Y, {});
+    ggml_vk_pool_malloc(sizeof(float) * d_ne * split_k, &d_D, {});
+
+    ggml_fp16_t* x = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * x_ne);
+    float* y = (float *) malloc(sizeof(float) * y_ne);
+    float* d = (float *) malloc(sizeof(float) * d_ne);
+
+    for (size_t i = 0; i < x_ne; i++) {
+        x[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
+    }
+    for (size_t i = 0; i < y_ne; i++) {
+        y[i] = rand() / (float)RAND_MAX;
+    }
+
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queues[0], {}, {}));

     ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);

-    ggml_vk_buffer_write(&d_X2, 0, x, sizeof(ggml_fp16_t) * k * m, vk_device.transfer_queues[0]);
-
+    // Wait for transfers to finish
     vk_device.transfer_queues[0].queue.waitIdle();

-    ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * kpad * m);
-    ggml_fp16_t * x_chk2 = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * k * m);
+    auto begin = std::chrono::high_resolution_clock::now();

-    ggml_vk_buffer_read(&d_X, 0, x_chk, sizeof(ggml_fp16_t) * kpad * m, vk_device.transfer_queues[0]);
-    ggml_vk_buffer_read(&d_X2, 0, x_chk2, sizeof(ggml_fp16_t) * k * m, vk_device.transfer_queues[0]);
+    for (size_t i = 0; i < num_it; i++) {
+        seq.push_back(ggml_vk_matmul(*p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), m, n, k, kpad, kpad, m, split_k, vk_device.compute_queue, {}, {}));
+    }

-    double avg_err_async = 0.0;
-    double avg_err_sync = 0.0;
+    ggml_vk_submit(vk_device.compute_queue, seq, VK_NULL_HANDLE);

-    for (size_t kidx = 0; kidx < kpad; kidx++) {
-        for (size_t midx = 0; midx < m; midx++) {
-            if (kidx < k) {
-                const float err = std::fabs(ggml_fp16_to_fp32(x[midx * k + kidx]) - ggml_fp16_to_fp32(x_chk[midx * kpad + kidx]));
-                const float err2 = std::fabs(ggml_fp16_to_fp32(x[midx * k + kidx]) - ggml_fp16_to_fp32(x_chk2[midx * k + kidx]));
-                if (!std::isnan(err)) {
-                    avg_err_async += err;
-                }
-                if (!std::isnan(err2)) {
-                    avg_err_sync += err;
-                }
+    vk_device.compute_queue.queue.waitIdle();

-                if (err > 0.01f) {
-                    std::cerr << "midx=" << midx << " kidx=" << kidx << " x: " << ggml_fp16_to_fp32(x[midx * k + kidx]) << " x_chk: " << ggml_fp16_to_fp32(x_chk[midx * kpad + kidx]) << " x_chk2: " << ggml_fp16_to_fp32(x_chk2[midx * k + kidx]) << std::endl;
-                }
-            } else {
-                const float val = std::fabs(ggml_fp16_to_fp32(x_chk[midx * kpad + kidx]));
-                if (val > 0.01f) {
-                    std::cerr << "ZEROPAD ERROR midx=" << midx << " kidx=" << kidx << " src0: 0.0 x_chkidx: " << val << std::endl;
-                    GGML_ASSERT(false);
-                }
-                avg_err_async += val;
-            }
+    auto end = std::chrono::high_resolution_clock::now();
+
+    // copy dst to host
+    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queues[0]);
+
+    float * fx = (float *) malloc(sizeof(float) * x_ne);
+    float * d_chk = (float *) malloc(sizeof(float) * d_ne);
+
+    ggml_fp16_to_fp32_row(x, fx, x_ne);
+
+    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+            m, n, k,
+            1.0f,    fx, k,
+                     y,  k,
+            0.0f,    d_chk, m);
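The cblas_sgemm call above (the reason -lcblas was added to the Makefile) computes the reference result in column-major layout: fx is the fp16 matrix converted to fp32, treated as k x m with lda = k and transposed, y is k x n, and d_chk is m x n with ldc = m. It is equivalent to the naive loop below, shown only to make the indexing in the error loop that follows easier to read; sketch, not patch content.

// Reference semantics of the cblas_sgemm call above.
for (size_t c = 0; c < n; c++) {
    for (size_t r = 0; r < m; r++) {
        float acc = 0.0f;
        for (size_t l = 0; l < k; l++) {
            acc += fx[r * k + l] * y[c * k + l];
        }
        d_chk[c * m + r] = acc;
    }
}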
+
+    double avg_err = 0.0;
+
+    for (size_t r = 0; r < m; r++) {
+        for (size_t c = 0; c < n; c++) {
+            avg_err += std::fabs(d[c * m + r] - d_chk[c * m + r]);
+        }
     }

-    std::cerr << "TEST BUFFER WRITE ZEROPAD m=" << m << " k=" << k << " align=" << align << " avg_err_async=" << avg_err_async / (kpad * m) << " avg_err_sync=" << avg_err_sync / (k * m) << std::endl;
+    std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " split_k=" << split_k << " matmul " << std::chrono::duration_cast(end-begin).count() / 1000.0 / num_it << "ms avg_err=" << avg_err / (m * n) << std::endl;
+
+    free(fx);
+    free(d_chk);
+
+    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
+    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
+    ggml_vk_queue_cleanup(vk_device.compute_queue);

-    free(x_chk);
-    ggml_vk_host_free(x);
     ggml_vk_pool_free(d_X);
+    ggml_vk_pool_free(d_Y);
+    ggml_vk_pool_free(d_D);
+
+    ggml_vk_pipeline_cleanup(*p);
+    ggml_vk_pipeline_cleanup(vk_pipeline_matmul_split_k_reduce);
+
+    free(x);
+    free(y);
+    free(d);
 }
 #endif
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index e25214107..61648ba3e 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -12,6 +12,10 @@ void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
 void ggml_vk_preallocate_buffers(void);
 void ggml_vk_build_graph(struct ggml_tensor * node);
 bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+void ggml_vk_check_results_0(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#endif
 void ggml_vk_graph_cleanup(void);

 void * ggml_vk_host_malloc(size_t size);
diff --git a/ggml.c b/ggml.c
index b2b4cfacb..7091afd27 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16615,6 +16615,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #elif defined(GGML_USE_VULKAN)
         const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        if (skip_cpu) {
+            ggml_vk_check_results_1(params, tensor);
+        }
+#endif
         if (skip_cpu) {
             return;
         }
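The generator changes below add GLSL support for the Q2_K and Q3_K K-quant formats (256 weights per super-block). As a reading aid, here is a hedged C++ sketch of the Q2_K dequantization rule the new dequant_q2_K_body shader applies; the struct and helper names are hypothetical and only mirror the GLSL field layout declared below.

// Sketch only -- mirrors block_q2_K as declared in the GLSL below.
typedef struct {
    uint8_t     scales[256 / 16];  // per-16-weight group: low nibble = scale, high nibble = min
    uint8_t     qs[256 / 4];       // 2-bit quants, four per byte
    ggml_fp16_t d[2];              // super-block scale d and min dmin
} example_block_q2_K;

// Dequantize the four weights that share byte qs[32*ip + il] (ip: 0..1, il: 0..31);
// they land 32 positions apart, exactly as in dequant_q2_K_body.
static void example_dequant_q2_K_quad(const example_block_q2_K * b, int ip, int il, float out[4]) {
    const float   d    = ggml_fp16_to_fp32(b->d[0]);
    const float   dmin = ggml_fp16_to_fp32(b->d[1]);
    const int     is   = 8 * ip + il / 16;
    const uint8_t qs   = b->qs[32 * ip + il];
    for (int j = 0; j < 4; j++) {
        const uint8_t sc = b->scales[is + 2 * j];
        out[j] = d * (float)(sc & 0xF) * (float)((qs >> (2 * j)) & 3) - dmin * (float)(sc >> 4);
    }
}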
diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py
index 8078a5d4a..b245515ac 100644
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
@@ -90,6 +90,32 @@ struct block_q8_0
 #define A_TYPE block_q8_0
 """

+# K-quants
+shader_q2_K_defines = """
+#define QUANT_K 256
+
+struct block_q2_K
+{
+    uint8_t scales[QUANT_K/16];
+    uint8_t qs[QUANT_K/4];
+    f16vec2 d;
+};
+
+#define A_TYPE block_q2_K
+"""
+shader_q3_K_defines = """
+#define QUANT_K 256
+
+struct block_q3_K
+{
+    uint8_t hmask[QUANT_K/8];
+    uint8_t qs[QUANT_K/4];
+    uint8_t scales[12];
+    float16_t d;
+};
+
+#define A_TYPE block_q3_K
+"""
 shader_q6_K_defines = """
 #define QUANT_K 256

@@ -410,7 +436,6 @@ dequant_head = """#version 450

 #extension GL_EXT_control_flow_attributes : require
 #extension GL_EXT_shader_16bit_storage : require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
 """

 dequant_body = """
@@ -436,7 +461,7 @@ void main() {
     if (row * QUANT_K >= p.K || col >= p.M) {
         return;
-    }
+    }

     const int stride_a = p.stride_a / QUANT_K;

@@ -450,11 +475,99 @@ void main() {
         y[col * p.stride_b + row*QUANT_K + iqs + 0       ] = D_TYPE(v.x);
         y[col * p.stride_b + row*QUANT_K + iqs + y_offset] = D_TYPE(v.y);
-    }
+    }
 }
 """

 # K-quants
+dequant_q2_K_body = """
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE x[];};
+layout (binding = 1) writeonly buffer D {D_TYPE y[];};
+
+layout (push_constant) uniform parameter
+{
+    int M;
+    int K;
+    int stride_a;
+    int stride_b;
+} p;
+
+void main() {
+    [[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
+        const int i = int(gl_WorkGroupID.x * 256 + wgy);
+        if (i >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const int tid = int(gl_LocalInvocationID.x);
+        const int ip = tid / 32;
+        const int il = tid - 32 * ip;
+        const int is = 8 * ip + il / 16;
+
+        const int y_idx = i * QUANT_K + 128 * ip + il;
+
+        const int ql_idx = 32 * ip + il;
+        const uint8_t qs = x[i].qs[32 * ip + il];
+
+        FLOAT_TYPE dall = FLOAT_TYPE(x[i].d.x);
+        FLOAT_TYPE dmin = FLOAT_TYPE(x[i].d.y);
+        y[y_idx + 0]  = D_TYPE(dall * FLOAT_TYPE((x[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(x[i].scales[is+0] >> 4));
+        y[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((x[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(x[i].scales[is+2] >> 4));
+        y[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((x[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(x[i].scales[is+4] >> 4));
+        y[y_idx + 96] = D_TYPE(dall * FLOAT_TYPE((x[i].scales[is+6] & 0xF) * ((qs >> 6) & 3)) - dmin * FLOAT_TYPE(x[i].scales[is+6] >> 4));
+    }
+}
+"""
+dequant_q3_K_body = """
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE x[];};
+layout (binding = 1) writeonly buffer D {D_TYPE y[];};
+
+layout (push_constant) uniform parameter
+{
+    int M;
+    int K;
+    int stride_a;
+    int stride_b;
+} p;
+
+void main() {
+    [[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
+        const int i = int(gl_WorkGroupID.x * 256 + wgy);
+        if (i >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const int r = int(gl_LocalInvocationID.x) / 4;
+        const int tid = r / 2;
+        const int is0 = r % 2;
+        const int l0 = 16 * is0 + 4 * (int(gl_LocalInvocationID.x) % 4);
+        const int n = tid / 4;
+        const int j = tid - 4*n;
+
+        const uint8_t m = uint8_t(1 << (4*n + j));
+        const int is = 8*n + 2*j + is0;
+        const int shift = 2*j;
+
+        const int8_t us = int8_t(is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
+                                 is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
+                                 is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
+                                           (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4));
+        const FLOAT_TYPE d_all = FLOAT_TYPE(x[i].d);
+        const FLOAT_TYPE dl    = d_all * FLOAT_TYPE(us - 32);
+
+        const int y_idx = i * QUANT_K + 128 * n + 32 * j;
+        const int qs_idx = 32*n;
+
+        for (int l = l0; l < l0 + 4; ++l) {
+            y[y_idx + l] = D_TYPE(dl * FLOAT_TYPE(int8_t((x[i].qs[qs_idx + l] >> shift) & 3) - (((x[i].hmask[l] & m) != 0) ? 0 : 4)));
+        }
+    }
+}
+"""
 dequant_q6_K_body = """
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
@@ -470,11 +583,11 @@ layout (push_constant) uniform parameter
 } p;

 void main() {
-    for (int wgy = 0; wgy < 256; wgy++) {
+    [[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
         const int i = int(gl_WorkGroupID.x * 256 + wgy);
         if (i >= p.M * p.K / QUANT_K) {
             return;
-        }
+        }
         const int tid = int(gl_LocalInvocationID.x);
         const int ip = tid / 32;
         const int il = tid - 32 * ip;
@@ -491,7 +604,7 @@ void main() {
         y[y_idx + 32] = D_TYPE(d * FLOAT_TYPE(x[i].scales[is + 2] * (int8_t((x[i].ql[ql_idx + 32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
         y[y_idx + 64] = D_TYPE(d * FLOAT_TYPE(x[i].scales[is + 4] * (int8_t((x[i].ql[ql_idx + 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)));
         y[y_idx + 96] = D_TYPE(d * FLOAT_TYPE(x[i].scales[is + 6] * (int8_t((x[i].ql[ql_idx + 32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)));
-    }
+    }
 }
 """
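The next hunk adds the matrix-vector kernels for Q2_K and Q3_K. They share one thread mapping: a 32-thread workgroup handles one output row, with K_QUANTS_PER_ITERATION threads cooperating on interleaved super-blocks. Below is a worked C++ example of the index math for K_QUANTS_PER_ITERATION == 2 (an assumption used only for this illustration); sketch, not patch content.

// Sketch of the GLSL index math in the kernels added below.
for (int lid = 0; lid < 32; lid++) {
    const int tid  = lid / 2;            // 0..15: lane within the super-block
    const int ix   = lid % 2;            // 0..1: which super-block this lane starts on
    const int step = 16 / 2;             // 8
    const int v_im = tid / step;         // 0 or 1: lower or upper 128 weights
    const int v_in = tid - step * v_im;  // 0..7
    const int l0   = 2 * v_in;           // first of the two quants this lane reads
    // lane lid then loops over super-blocks ix, ix + 2, ix + 4, ...
}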
@@ -553,6 +666,154 @@ void main() {
 }
 """

+# K-quants
+mul_mat_vec_q2_K_body = """
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE x[];};
+layout (binding = 1) readonly buffer B {B_TYPE y[];};
+layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
+
+layout (push_constant) uniform parameter
+{
+    int ncols;
+} p;
+
+shared FLOAT_TYPE tmp[32];
+
+void main() {
+    const int row = int(gl_WorkGroupID.x);
+
+    const int num_blocks_per_row = p.ncols / QUANT_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const int tid = int(gl_LocalInvocationID.x)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
+    const int ix  = int(gl_LocalInvocationID.x)%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;            // 16 or 8
+
+    const int v_im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int v_in = tid - step*v_im;                      // 0...15 or 0...7
+
+    const int l0 = K_QUANTS_PER_ITERATION*v_in;            // 0...15
+    const int q_offset = 32*v_im + l0;
+    const int s_offset = 8*v_im;
+    const int y_offset = 128*v_im + l0;
+
+    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
+
+    [[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+        const int y_idx = i * QUANT_K + y_offset;
+
+        const FLOAT_TYPE dall = FLOAT_TYPE(x[ib0 + i].d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(x[ib0 + i].d.y);
+
+        FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
+        FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 += FLOAT_TYPE(y[y_idx + l +  0]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l + 0] >> 0) & 3)
+                  + FLOAT_TYPE(y[y_idx + l + 16]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l +16] >> 0) & 3)
+                  + FLOAT_TYPE(y[y_idx + l + 32]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l + 0] >> 2) & 3)
+                  + FLOAT_TYPE(y[y_idx + l + 48]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l +16] >> 2) & 3)
+                  + FLOAT_TYPE(y[y_idx + l + 64]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l + 0] >> 4) & 3)
+                  + FLOAT_TYPE(y[y_idx + l + 80]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l +16] >> 4) & 3)
+                  + FLOAT_TYPE(y[y_idx + l + 96]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l + 0] >> 6) & 3)
+                  + FLOAT_TYPE(y[y_idx + l +112]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l +16] >> 6) & 3);
+            sum2 += FLOAT_TYPE(y[y_idx + l +  0]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 0] >> 4) & 0xF)
+                  + FLOAT_TYPE(y[y_idx + l + 16]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 1] >> 4) & 0xF)
+                  + FLOAT_TYPE(y[y_idx + l + 32]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 2] >> 4) & 0xF)
+                  + FLOAT_TYPE(y[y_idx + l + 48]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 3] >> 4) & 0xF)
+                  + FLOAT_TYPE(y[y_idx + l + 64]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 4] >> 4) & 0xF)
+                  + FLOAT_TYPE(y[y_idx + l + 80]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 5] >> 4) & 0xF)
+                  + FLOAT_TYPE(y[y_idx + l + 96]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 6] >> 4) & 0xF)
+                  + FLOAT_TYPE(y[y_idx + l +112]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 7] >> 4) & 0xF);
+        }
+        tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
+    }
+
+    // sum up partial sums and write back result
+    barrier();
+    [[unroll]] for (int s = 16; s > 0; s >>= 1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        barrier();
+    }
+    if (tid == 0) {
+        dst[row] = D_TYPE(tmp[0]);
+    }
+}
+"""
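Both of these kernels finish with the same shared-memory tree reduction over the 32 per-thread partial sums. A CPU analogue of that final loop is sketched below for reference; on the GPU a barrier() separates the strides. Sketch only, not patch content.

// CPU analogue of the reduction at the end of each mul_mat_vec_*_K shader.
float tmp[32] = {0.0f};  // one partial dot product per thread
for (int s = 16; s > 0; s >>= 1) {
    for (int tid = 0; tid < s; tid++) {
        tmp[tid] += tmp[tid + s];
    }
    // barrier() goes here on the GPU before the next stride
}
// tmp[0] now holds the value written to dst[row]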
+mul_mat_vec_q3_K_body = """
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE x[];};
+layout (binding = 1) readonly buffer B {B_TYPE y[];};
+layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
+
+layout (push_constant) uniform parameter
+{
+    int ncols;
+} p;
+
+shared FLOAT_TYPE tmp[32];
+
+void main() {
+    const int row = int(gl_WorkGroupID.x);
+
+    const int num_blocks_per_row = p.ncols / QUANT_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const int tid = int(gl_LocalInvocationID.x)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
+    const int ix  = int(gl_LocalInvocationID.x)%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;            // 16 or 8
+
+    const int v_im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int v_in = tid - step*v_im;                      // 0...15 or 0...7
+
+    const uint8_t m = uint8_t(1 << (4 * v_im));
+
+    const int l0 = K_QUANTS_PER_ITERATION*v_in;            // 0...15
+    const int q_offset = 32*v_im + l0;
+    const int y_offset = 128*v_im + l0;
+
+    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
+
+    const uint s_shift = 4 * v_im;
+
+    [[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+        const int y_idx = i * QUANT_K + y_offset;
+
+        const FLOAT_TYPE d = FLOAT_TYPE(x[ib0 + i].d);
+
+        FLOAT_TYPE sum = FLOAT_TYPE(0.0);
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum += FLOAT_TYPE(y[y_idx + l +  0]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[0] >> s_shift) & 0xF) | ((x[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l   ]     ) & 3) - (((x[ib0 + i].hmask[l0 + l   ] & (m << 0)) != 0) ? 0 : 4))
+                 + FLOAT_TYPE(y[y_idx + l + 32]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[2] >> s_shift) & 0xF) | ((x[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l   ] >> 2) & 3) - (((x[ib0 + i].hmask[l0 + l   ] & (m << 1)) != 0) ? 0 : 4))
+                 + FLOAT_TYPE(y[y_idx + l + 64]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[4] >> s_shift) & 0xF) | ((x[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l   ] >> 4) & 3) - (((x[ib0 + i].hmask[l0 + l   ] & (m << 2)) != 0) ? 0 : 4))
+                 + FLOAT_TYPE(y[y_idx + l + 96]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[6] >> s_shift) & 0xF) | ((x[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l   ] >> 6) & 3) - (((x[ib0 + i].hmask[l0 + l   ] & (m << 3)) != 0) ? 0 : 4))
+                 + FLOAT_TYPE(y[y_idx + l + 16]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[1] >> s_shift) & 0xF) | ((x[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l+16]     ) & 3) - (((x[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4))
+                 + FLOAT_TYPE(y[y_idx + l + 48]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[3] >> s_shift) & 0xF) | ((x[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((x[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4))
+                 + FLOAT_TYPE(y[y_idx + l + 80]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[5] >> s_shift) & 0xF) | ((x[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((x[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4))
+                 + FLOAT_TYPE(y[y_idx + l +112]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[7] >> s_shift) & 0xF) | ((x[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((x[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4));
+        }
+        tmp[16 * ix + tid] += d * sum;
+    }
+
+    // sum up partial sums and write back result
+    barrier();
+    [[unroll]] for (int s = 16; s > 0; s >>= 1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        barrier();
+    }
+    if (tid == 0) {
+        dst[row] = D_TYPE(tmp[0]);
+    }
+}
+"""
 mul_mat_vec_q6_K_body = """
 layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
@@ -596,7 +857,7 @@ void main() {

     tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp

-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+    [[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
         const int y_idx = i * QUANT_K + y_offset;

         const FLOAT_TYPE d = FLOAT_TYPE(x[ib0 + i].d);
@@ -618,10 +879,10 @@ void main() {
                  + FLOAT_TYPE(y[y_idx + l+32]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((x[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32)
                  + FLOAT_TYPE(y[y_idx + l+64]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((x[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32)
                  + FLOAT_TYPE(y[y_idx + l+96]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+32] >> 4) | (((x[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32);
-        }
+        }
         tmp[16 * ix + tid] += sum;
 #endif
-    }
+    }

     // sum up partial sums and write back result
     barrier();
@@ -843,8 +1104,8 @@ async def string_to_spv_file(name, code, defines, fp16):
         preprocessed_code = stdout.decode()

         cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
-        code_with_lines = "\n".join([f"{i}: {line}" for i, line in enumerate(preprocessed_code.splitlines())])
-        print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error=}")
+        code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())])
+        print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}")
         f.close()
         os.remove(f.name)
         sys.exit(proc.returncode)
@@ -919,6 +1180,10 @@ async def main():
                 stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, dequant_body))
             elif i == GGML_TYPE_Q8_0:
                 stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, dequant_body))
+            elif i == GGML_TYPE_Q2_K:
+                stream.extend((shader_q2_K_defines, dequant_q2_K_body))
+            elif i == GGML_TYPE_Q3_K:
+                stream.extend((shader_q3_K_defines, dequant_q3_K_body))
             elif i == GGML_TYPE_Q6_K:
                 stream.extend((shader_q6_K_defines, dequant_q6_K_body))
             else:
@@ -943,6 +1208,10 @@ async def main():
                 stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, mul_mat_vec_body))
             elif i == GGML_TYPE_Q8_0:
                 stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, mul_mat_vec_body))
+            elif i == GGML_TYPE_Q2_K:
+                stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body))
+            elif i == GGML_TYPE_Q3_K:
+                stream.extend((shader_q3_K_defines, mul_mat_vec_q3_K_body))
             elif i == GGML_TYPE_Q6_K:
                 stream.extend((shader_q6_K_defines, mul_mat_vec_q6_K_body))
             else: