Add q2_k and q3_k support
Add validation check to compare shader results to cpu results
parent e90a6515dd
commit a861879256
5 changed files with 581 additions and 77 deletions
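For orientation before the per-file hunks: the new validation path brackets every offloaded op with two hooks, ggml_vk_check_results_0 (recomputes the op on the CPU and keeps the result in a reference buffer) and ggml_vk_check_results_1 (compares the Vulkan output against that reference, flagging NaNs and large average error). Below is a minimal, self-contained C++ sketch of that pattern only; the function names, the toy element-wise op, and the reporting are placeholders, not the real ggml/ggml-vulkan API.

// Sketch of the CPU-reference validation pattern (placeholder names, not ggml API).
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> comp_result_ref;    // plays the role of the global comp_result buffer

// Analogue of ggml_vk_check_results_0: build the CPU reference before the GPU runs the op.
static void check_results_0(const std::vector<float> &a, const std::vector<float> &b) {
    comp_result_ref.resize(a.size());
    for (size_t i = 0; i < a.size(); i++) {
        comp_result_ref[i] = a[i] * b[i];     // stand-in for the real operation (e.g. MUL / MUL_MAT)
    }
}

// Analogue of ggml_vk_check_results_1: compare the GPU output against the stored reference.
static void check_results_1(const std::vector<float> &gpu_out) {
    double avg_err = 0.0;
    for (size_t i = 0; i < gpu_out.size(); i++) {
        if (std::isnan(gpu_out[i]) || std::isnan(comp_result_ref[i])) {
            std::fprintf(stderr, "ERROR: NaN value at index %zu\n", i);
            return;
        }
        avg_err += std::fabs(comp_result_ref[i] - gpu_out[i]);
    }
    avg_err /= (double) gpu_out.size();
    std::fprintf(stderr, "avg_err=%g\n", avg_err);
}

int main() {
    std::vector<float> a = {1.0f, 2.0f, 3.0f}, b = {4.0f, 5.0f, 6.0f};
    check_results_0(a, b);                          // CPU reference computed first
    std::vector<float> gpu = {4.0f, 10.0f, 18.0f};  // pretend this came back from the shader
    check_results_1(gpu);                           // element-wise comparison afterwards
    return 0;
}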
Makefile (8 changed lines)
@@ -456,8 +456,14 @@ endif # LLAMA_CLBLAST
ifdef LLAMA_VULKAN
    CFLAGS += -DGGML_USE_VULKAN
    CXXFLAGS += -DGGML_USE_VULKAN
-   LDFLAGS += -lvulkan
+   LDFLAGS += -lvulkan -lcblas
    OBJS += ggml-vulkan.o
+
+ifdef LLAMA_VULKAN_CHECK_RESULTS
+   CFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+   CXXFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
	$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_VULKAN
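With the guards above, the backend and the new validation path would presumably be enabled through the usual Makefile switches, e.g. make LLAMA_VULKAN=1 LLAMA_VULKAN_CHECK_RESULTS=1 (invocation inferred from the ifdef blocks, not shown in the diff). The added -lcblas is consistent with the CPU-side checks and matmul tests calling cblas_sgemm further down.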
ggml-vulkan.cpp (350 changed lines)
@@ -303,7 +303,7 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s
|
||||
static vk_pipeline ggml_vk_create_pipeline_from_file(const std::string& name, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<int>&& specialization_constants, uint32_t align) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_create_pipeline_from_file(" << path << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
|
||||
std::cerr << "ggml_vk_create_pipeline_from_file(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
|
||||
#endif
|
||||
|
||||
const std::string path = "vk_shaders/" + name + (vk_device.fp16 ? "" : "_fp32") + ".comp";
|
||||
@@ -663,6 +663,8 @@ static inline bool ggml_vk_build_shader(ggml_type type) {
    case GGML_TYPE_Q5_0:
    case GGML_TYPE_Q5_1:
    case GGML_TYPE_Q8_0:
+   case GGML_TYPE_Q2_K:
+   case GGML_TYPE_Q3_K:
    case GGML_TYPE_Q6_K:
        return true;
    default:
@@ -681,25 +683,25 @@ static void ggml_vk_load_shaders() {
auto warptile_s = { 32, 32, 32, 8, 32, 32, 2, 2, 2 };
|
||||
|
||||
vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline_from_file("matmul_f32_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
|
||||
vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline_from_file("matmul_f32_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline_from_file("matmul_f32_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f32_s = ggml_vk_create_pipeline_from_file("matmul_f32_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
|
||||
vk_pipeline_matmul_f32_aligned_l = ggml_vk_create_pipeline_from_file("matmul_f32_aligned_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
|
||||
vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f32_aligned_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f32_aligned_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f32_aligned_s = ggml_vk_create_pipeline_from_file("matmul_f32_aligned_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
|
||||
|
||||
vk_pipeline_matmul_f16_l = ggml_vk_create_pipeline_from_file("matmul_f16_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
|
||||
vk_pipeline_matmul_f16_m = ggml_vk_create_pipeline_from_file("matmul_f16_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f16_m = ggml_vk_create_pipeline_from_file("matmul_f16_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f16_s = ggml_vk_create_pipeline_from_file("matmul_f16_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
|
||||
|
||||
vk_pipeline_matmul_f16_aligned_l = ggml_vk_create_pipeline_from_file("matmul_f16_aligned_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
|
||||
vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f16_aligned_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f16_aligned_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f16_aligned_s = ggml_vk_create_pipeline_from_file("matmul_f16_aligned_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
|
||||
|
||||
vk_pipeline_matmul_f16_f32_l = ggml_vk_create_pipeline_from_file("matmul_f16_f32_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
|
||||
vk_pipeline_matmul_f16_f32_m = ggml_vk_create_pipeline_from_file("matmul_f16_f32_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f16_f32_m = ggml_vk_create_pipeline_from_file("matmul_f16_f32_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f16_f32_s = ggml_vk_create_pipeline_from_file("matmul_f16_f32_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
|
||||
vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline_from_file("matmul_f16_f32_aligned_l", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
|
||||
vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f16_f32_aligned_m", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline_from_file("matmul_f16_f32_aligned_m", "main", 3, 7 * sizeof(int), {64, 64, 1}, warptile_m, 64);
|
||||
vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline_from_file("matmul_f16_f32_aligned_s", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
|
||||
|
||||
// Build dequant shaders
|
||||
@@ -737,7 +739,7 @@ static void ggml_vk_load_shaders() {
void ggml_vk_test_transfer(size_t ne);
|
||||
void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
|
||||
void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
|
||||
void ggml_vk_test_buffer_write_zeropad(size_t m, size_t k, size_t align);
|
||||
void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
|
||||
|
||||
void ggml_vk_init(void) {
|
||||
#ifdef VK_DEBUG
|
||||
@@ -894,10 +896,6 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
vk_fence = vk_device.device.createFence({});
|
||||
|
||||
#if defined(VK_CHK_KERNEL)
|
||||
ggml_vk_test_buffer_write_zeropad(233, 97, 128);
|
||||
ggml_vk_test_buffer_write_zeropad(233, 97, 1);
|
||||
ggml_vk_test_buffer_write_zeropad(256, 128, 1);
|
||||
|
||||
int step = 16;
|
||||
for (size_t m = step; m < 64; m += step) {
|
||||
ggml_vk_test_transfer(1024 * 1024 * m);
|
||||
@@ -936,6 +934,14 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
ggml_vk_test_matmul_f16(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 1);
|
||||
ggml_vk_test_matmul_f16(vals[i], vals[i + 1], vals[i + 2], 1000, 1, 2);
|
||||
ggml_vk_test_matmul_f16(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 2);
|
||||
std::cerr << std::endl;
|
||||
|
||||
ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 1, 0);
|
||||
ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 0);
|
||||
ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 1, 1);
|
||||
ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 1);
|
||||
ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 1, 2);
|
||||
ggml_vk_test_matmul_f16_f32(vals[i], vals[i + 1], vals[i + 2], 1000, 4, 2);
|
||||
std::cerr << std::endl << std::endl;
|
||||
}
|
||||
#endif
|
||||
@@ -952,6 +958,8 @@ static inline vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) {
    case GGML_TYPE_Q5_0:
    case GGML_TYPE_Q5_1:
    case GGML_TYPE_Q8_0:
+   case GGML_TYPE_Q2_K:
+   case GGML_TYPE_Q3_K:
    case GGML_TYPE_Q6_K:
        break;
    default:
@@ -972,6 +980,8 @@ static inline vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type, bo
    case GGML_TYPE_Q5_0:
    case GGML_TYPE_Q5_1:
    case GGML_TYPE_Q8_0:
+   case GGML_TYPE_Q2_K:
+   case GGML_TYPE_Q3_K:
    case GGML_TYPE_Q6_K:
        break;
    default:
@@ -2539,7 +2549,7 @@ void ggml_vk_build_graph(ggml_tensor * node){
}
|
||||
}
|
||||
|
||||
bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
||||
bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor){
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
||||
@@ -2583,6 +2593,11 @@ bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_te
return false;
|
||||
}
|
||||
|
||||
if (extra == nullptr) {
|
||||
// Graph hasn't been prepared, fall back to CPU
|
||||
return false;
|
||||
}
|
||||
|
||||
if (params->ith != 0) {
|
||||
return true;
|
||||
}
|
||||
@@ -2590,7 +2605,9 @@ bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_te
return true;
|
||||
}
|
||||
|
||||
GGML_ASSERT(extra);
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
ggml_vk_check_results_0(params, tensor);
|
||||
#endif
|
||||
|
||||
// Do staging buffer copies
|
||||
for (auto& cpy : extra->memcpys) {
|
||||
@@ -2630,6 +2647,162 @@ void ggml_vk_graph_cleanup() {
vk_gc.extras.clear();
|
||||
}
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
void * comp_result;
|
||||
void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor * src0 = tensor->src[0];
|
||||
ggml_tensor * src1 = tensor->src[1];
|
||||
|
||||
struct ggml_init_params iparams = {
|
||||
.mem_size = 512*1024*1024,
|
||||
.mem_buffer = NULL,
|
||||
};
|
||||
|
||||
// memory allocation happens here
|
||||
struct ggml_context * ctx = ggml_init(iparams);
|
||||
|
||||
struct ggml_tensor * src0_clone = nullptr;
|
||||
struct ggml_tensor * src1_clone = nullptr;
|
||||
struct ggml_tensor * tensor_clone = nullptr;
|
||||
|
||||
size_t src0_size;
|
||||
size_t src1_size;
|
||||
|
||||
if (src0 != nullptr) {
|
||||
src0_clone = ggml_dup_tensor(ctx, src0);
|
||||
|
||||
// Some tensors have wrong strides for some reason
|
||||
src0_size = src0->nb[1] * src0->ne[1] * src0->ne[2] * src0->ne[3];
|
||||
|
||||
src0_clone->data = malloc(src0_size);
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
memcpy(src0_clone->data, src0->data, src0_size);
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
ggml_vk_buffer_read((vk_buffer *)src0->data, 0, src0_clone->data, src0_size, vk_device.transfer_queues[0]);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
GGML_ASSERT(src0_clone->ne[i] == src0->ne[i]);
|
||||
GGML_ASSERT(src0_clone->nb[i] == src0->nb[i]);
|
||||
}
|
||||
}
|
||||
if (src1 != nullptr) {
|
||||
src1_clone = ggml_dup_tensor(ctx, src1);
|
||||
|
||||
src1_size = src1->ne[3] * src1->nb[3];
|
||||
|
||||
src1_clone->data = malloc(src1_size);
|
||||
if (src1->backend == GGML_BACKEND_CPU) {
|
||||
memcpy(src1_clone->data, src1->data, src1_size);
|
||||
} else if (src1->backend == GGML_BACKEND_GPU) {
|
||||
ggml_vk_buffer_read((vk_buffer *)src1->data, 0, src1_clone->data, src1_size, vk_device.transfer_queues[0]);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
GGML_ASSERT(src1_clone->ne[i] == src1->ne[i]);
|
||||
GGML_ASSERT(src1_clone->nb[i] == src1->nb[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (tensor->op == GGML_OP_MUL_MAT) {
|
||||
tensor_clone = ggml_mul_mat(ctx, src0_clone, src1_clone);
|
||||
} else if (tensor->op == GGML_OP_MUL) {
|
||||
tensor_clone = ggml_mul(ctx, src0_clone, src1_clone);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
struct ggml_cgraph cgraph = ggml_build_forward(tensor_clone);
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx, &cgraph, 8);
|
||||
|
||||
size_t tensor_size = tensor_clone->ne[3] * tensor_clone->nb[3];
|
||||
|
||||
comp_result = malloc(tensor_size);
|
||||
memcpy(comp_result, tensor_clone->data, tensor_size);
|
||||
|
||||
free(src0_clone->data);
|
||||
free(src1_clone->data);
|
||||
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor * src0 = tensor->src[0];
|
||||
ggml_tensor * src1 = tensor->src[1];
|
||||
|
||||
double avg_err = 0.0f;
|
||||
|
||||
for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
float correct = *(float *) ((char *) comp_result + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
|
||||
float result = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
|
||||
|
||||
if (std::isnan(correct) || std::isnan(result)) {
|
||||
std::cerr << "ERROR: NaN value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << std::endl;
|
||||
std::cerr << "tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << std::endl;
|
||||
if (tensor->src[0] != nullptr) {
|
||||
std::cerr << "src0 type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << std::endl;
|
||||
}
|
||||
if (tensor->src[1] != nullptr) {
|
||||
std::cerr << "src1 type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << std::endl;
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
avg_err += std::fabs(correct - result);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
avg_err /= tensor->ne[3] * tensor->ne[2] * tensor->ne[1] * tensor->ne[0];
|
||||
|
||||
if (avg_err > 1.0 || std::isnan(avg_err)) {
|
||||
std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << std::endl;
|
||||
std::cerr << "tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << std::endl;
|
||||
if (tensor->src[0] != nullptr) {
|
||||
std::cerr << "src0 type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << std::endl;
|
||||
}
|
||||
if (tensor->src[1] != nullptr) {
|
||||
std::cerr << "src1 type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << std::endl;
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
free(comp_result);
|
||||
comp_result = nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef VK_CHK_KERNEL
|
||||
void ggml_vk_test_transfer(size_t ne) {
|
||||
#ifdef VK_DEBUG
|
||||
@@ -2728,8 +2901,8 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
y[i] = rand() / (float)RAND_MAX;
|
||||
}
|
||||
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_X, 0, x, sizeof(float) * k, sizeof(float) * k, m, sizeof(float) * p->align, vk_device.transfer_queues[0], {}, {}));
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, sizeof(float) * p->align, vk_device.transfer_queues[0], {}, {}));
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(float) * k, sizeof(float) * k, m, vk_device.transfer_queues[0], {}, {}));
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queues[0], {}, {}));
|
||||
|
||||
ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);
|
||||
|
||||
@@ -2840,8 +3013,8 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
y[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
|
||||
}
|
||||
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, sizeof(ggml_fp16_t) * p->align, vk_device.transfer_queues[0], {}, {}));
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_Y, 0, y, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, n, sizeof(ggml_fp16_t) * p->align, vk_device.transfer_queues[0], {}, {}));
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queues[0], {}, {}));
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, n, vk_device.transfer_queues[0], {}, {}));
|
||||
|
||||
ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);
|
||||
|
||||
@@ -2906,72 +3079,119 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
free(d);
|
||||
}
|
||||
|
||||
void ggml_vk_test_buffer_write_zeropad(size_t m, size_t k, size_t align) {
|
||||
void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_test_buffer_write_zeropad(" << m << ", " << k << ", " << align << ")" << std::endl;
|
||||
std::cerr << "ggml_vk_test_matmul_f16(" << m << ", " << n << ", " << k << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
|
||||
#endif
|
||||
if (!vk_device.fp16) {
|
||||
return;
|
||||
}
|
||||
const size_t x_ne = m * k;
|
||||
const size_t y_ne = k * n;
|
||||
const size_t d_ne = m * n;
|
||||
|
||||
std::vector<vk_sequence> seq;
|
||||
|
||||
const size_t kpad = ggml_vk_align_size(k, align);
|
||||
|
||||
vk_buffer d_X;
|
||||
ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * kpad * m, &d_X, {});
|
||||
vk_buffer d_X2;
|
||||
ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * k * m, &d_X2, {});
|
||||
|
||||
ggml_fp16_t* x = (ggml_fp16_t *) ggml_vk_host_malloc(sizeof(ggml_fp16_t) * m * k);
|
||||
|
||||
for (size_t i = 0; i < m * k; i++) {
|
||||
x[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
|
||||
vk_pipeline * p;
|
||||
std::string shname;
|
||||
if (shader_size == 0) {
|
||||
p = &vk_pipeline_matmul_f16_f32_s;
|
||||
shname = "F16_F32_S";
|
||||
} else if (shader_size == 1) {
|
||||
p = &vk_pipeline_matmul_f16_f32_m;
|
||||
shname = "F16_F32_M";
|
||||
} else if (shader_size == 2) {
|
||||
p = &vk_pipeline_matmul_f16_f32_l;
|
||||
shname = "F16_F32_L";
|
||||
} else {
|
||||
GGML_ASSERT(0);
|
||||
}
|
||||
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async_zeropad(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, sizeof(ggml_fp16_t) * align, vk_device.transfer_queues[0], {}, {}));
|
||||
const size_t kpad = ggml_vk_align_size(k, p->align);
|
||||
|
||||
ggml_vk_pipeline_allocate_descriptor_sets(*p, num_it);
|
||||
if (split_k > 1) {
|
||||
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, num_it);
|
||||
}
|
||||
|
||||
vk_buffer d_X;
|
||||
vk_buffer d_Y;
|
||||
vk_buffer d_D;
|
||||
ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * kpad * m, &d_X, {});
|
||||
ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * kpad * n, &d_Y, {});
|
||||
ggml_vk_pool_malloc(sizeof(float) * d_ne * split_k, &d_D, {});
|
||||
|
||||
ggml_fp16_t* x = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * x_ne);
|
||||
float* y = (float *) malloc(sizeof(float) * y_ne);
|
||||
float* d = (float *) malloc(sizeof(float) * d_ne);
|
||||
|
||||
for (size_t i = 0; i < x_ne; i++) {
|
||||
x[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
|
||||
}
|
||||
for (size_t i = 0; i < y_ne; i++) {
|
||||
y[i] = rand() / (float)RAND_MAX;
|
||||
}
|
||||
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queues[0], {}, {}));
|
||||
seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queues[0], {}, {}));
|
||||
|
||||
ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);
|
||||
|
||||
ggml_vk_buffer_write(&d_X2, 0, x, sizeof(ggml_fp16_t) * k * m, vk_device.transfer_queues[0]);
|
||||
|
||||
// Wait for transfers to finish
|
||||
vk_device.transfer_queues[0].queue.waitIdle();
|
||||
|
||||
ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * kpad * m);
|
||||
ggml_fp16_t * x_chk2 = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * k * m);
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ggml_vk_buffer_read(&d_X, 0, x_chk, sizeof(ggml_fp16_t) * kpad * m, vk_device.transfer_queues[0]);
|
||||
ggml_vk_buffer_read(&d_X2, 0, x_chk2, sizeof(ggml_fp16_t) * k * m, vk_device.transfer_queues[0]);
|
||||
for (size_t i = 0; i < num_it; i++) {
|
||||
seq.push_back(ggml_vk_matmul(*p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), m, n, k, kpad, kpad, m, split_k, vk_device.compute_queue, {}, {}));
|
||||
}
|
||||
|
||||
double avg_err_async = 0.0;
|
||||
double avg_err_sync = 0.0;
|
||||
ggml_vk_submit(vk_device.compute_queue, seq, VK_NULL_HANDLE);
|
||||
|
||||
for (size_t kidx = 0; kidx < kpad; kidx++) {
|
||||
for (size_t midx = 0; midx < m; midx++) {
|
||||
if (kidx < k) {
|
||||
const float err = std::fabs(ggml_fp16_to_fp32(x[midx * k + kidx]) - ggml_fp16_to_fp32(x_chk[midx * kpad + kidx]));
|
||||
const float err2 = std::fabs(ggml_fp16_to_fp32(x[midx * k + kidx]) - ggml_fp16_to_fp32(x_chk2[midx * k + kidx]));
|
||||
if (!std::isnan(err)) {
|
||||
avg_err_async += err;
|
||||
}
|
||||
if (!std::isnan(err2)) {
|
||||
avg_err_sync += err;
|
||||
}
|
||||
vk_device.compute_queue.queue.waitIdle();
|
||||
|
||||
if (err > 0.01f) {
|
||||
std::cerr << "midx=" << midx << " kidx=" << kidx << " x: " << ggml_fp16_to_fp32(x[midx * k + kidx]) << " x_chk: " << ggml_fp16_to_fp32(x_chk[midx * kpad + kidx]) << " x_chk2: " << ggml_fp16_to_fp32(x_chk2[midx * k + kidx]) << std::endl;
|
||||
}
|
||||
} else {
|
||||
const float val = std::fabs(ggml_fp16_to_fp32(x_chk[midx * kpad + kidx]));
|
||||
if (val > 0.01f) {
|
||||
std::cerr << "ZEROPAD ERROR midx=" << midx << " kidx=" << kidx << " src0: 0.0 x_chkidx: " << val << std::endl;
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
avg_err_async += val;
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// copy dst to host
|
||||
ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queues[0]);
|
||||
|
||||
float * fx = (float *) malloc(sizeof(float) * x_ne);
|
||||
float * d_chk = (float *) malloc(sizeof(float) * d_ne);
|
||||
|
||||
ggml_fp16_to_fp32_row(x, fx, x_ne);
|
||||
|
||||
cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
|
||||
m, n, k,
|
||||
1.0f, fx, k,
|
||||
y, k,
|
||||
0.0f, d_chk, m);
|
||||
|
||||
double avg_err = 0.0;
|
||||
|
||||
for (size_t r = 0; r < m; r++) {
|
||||
for (size_t c = 0; c < n; c++) {
|
||||
avg_err += std::fabs(d[c * m + r] - d_chk[c * m + r]);
|
||||
}
|
||||
}
|
||||
|
||||
std::cerr << "TEST BUFFER WRITE ZEROPAD m=" << m << " k=" << k << " align=" << align << " avg_err_async=" << avg_err_async / (kpad * m) << " avg_err_sync=" << avg_err_sync / (k * m) << std::endl;
|
||||
std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " split_k=" << split_k << " matmul " << std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0 / num_it << "ms avg_err=" << avg_err / (m * n) << std::endl;
|
||||
|
||||
free(fx);
|
||||
free(d_chk);
|
||||
|
||||
ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
|
||||
ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
|
||||
ggml_vk_queue_cleanup(vk_device.compute_queue);
|
||||
|
||||
free(x_chk);
|
||||
ggml_vk_host_free(x);
|
||||
ggml_vk_pool_free(d_X);
|
||||
ggml_vk_pool_free(d_Y);
|
||||
ggml_vk_pool_free(d_D);
|
||||
|
||||
ggml_vk_pipeline_cleanup(*p);
|
||||
ggml_vk_pipeline_cleanup(vk_pipeline_matmul_split_k_reduce);
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
free(d);
|
||||
}
|
||||
#endif
ggml-vulkan.h
@@ -12,6 +12,10 @@ void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
void ggml_vk_preallocate_buffers(void);
void ggml_vk_build_graph(struct ggml_tensor * node);
bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+void ggml_vk_check_results_0(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#endif
void ggml_vk_graph_cleanup(void);

void * ggml_vk_host_malloc(size_t size);
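Note on the two declarations added above: as the ggml-vulkan.cpp and ggml.c hunks show, ggml_vk_check_results_0 runs inside ggml_vk_compute_forward before the staging copies and GPU submission, and ggml_vk_check_results_1 runs from ggml_compute_forward once the Vulkan path has handled the op, so every offloaded tensor is validated against the CPU result.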
ggml.c (5 changed lines)
@@ -16615,6 +16615,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
        GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#elif defined(GGML_USE_VULKAN)
        const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        if (skip_cpu) {
+            ggml_vk_check_results_1(params, tensor);
+        }
+#endif
        if (skip_cpu) {
            return;
        }
ggml_vk_generate_shaders.py
@@ -90,6 +90,32 @@ struct block_q8_0
#define A_TYPE block_q8_0
"""

+# K-quants
+shader_q2_K_defines = """
+#define QUANT_K 256
+
+struct block_q2_K
+{
+    uint8_t scales[QUANT_K/16];
+    uint8_t qs[QUANT_K/4];
+    f16vec2 d;
+};
+
+#define A_TYPE block_q2_K
+"""
+shader_q3_K_defines = """
+#define QUANT_K 256
+
+struct block_q3_K
+{
+    uint8_t hmask[QUANT_K/8];
+    uint8_t qs[QUANT_K/4];
+    uint8_t scales[12];
+    float16_t d;
+};
+
+#define A_TYPE block_q3_K
+"""
shader_q6_K_defines = """
#define QUANT_K 256

@@ -410,7 +436,6 @@ dequant_head = """#version 450
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
||||
"""
|
||||
|
||||
dequant_body = """
|
||||
@@ -436,7 +461,7 @@ void main() {
|
||||
if (row * QUANT_K >= p.K || col >= p.M) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const int stride_a = p.stride_a / QUANT_K;
|
||||
|
||||
@@ -450,11 +475,99 @@ void main() {
|
||||
y[col * p.stride_b + row*QUANT_K + iqs + 0 ] = D_TYPE(v.x);
|
||||
y[col * p.stride_b + row*QUANT_K + iqs + y_offset] = D_TYPE(v.y);
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
# K-quants
|
||||
dequant_q2_K_body = """
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE x[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE y[];};
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
int M;
|
||||
int K;
|
||||
int stride_a;
|
||||
int stride_b;
|
||||
} p;
|
||||
|
||||
void main() {
|
||||
[[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
|
||||
const int i = int(gl_WorkGroupID.x * 256 + wgy);
|
||||
if (i >= p.M * p.K / QUANT_K) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int tid = int(gl_LocalInvocationID.x);
|
||||
const int ip = tid / 32;
|
||||
const int il = tid - 32 * ip;
|
||||
const int is = 8 * ip + il / 16;
|
||||
|
||||
const int y_idx = i * QUANT_K + 128 * ip + il;
|
||||
|
||||
const int ql_idx = 32 * ip + il;
|
||||
const uint8_t qs = x[i].qs[32 * ip + il];
|
||||
|
||||
FLOAT_TYPE dall = FLOAT_TYPE(x[i].d.x);
|
||||
FLOAT_TYPE dmin = FLOAT_TYPE(x[i].d.y);
|
||||
y[y_idx + 0] = D_TYPE(dall * FLOAT_TYPE((x[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(x[i].scales[is+0] >> 4));
|
||||
y[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((x[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(x[i].scales[is+2] >> 4));
|
||||
y[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((x[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(x[i].scales[is+4] >> 4));
|
||||
y[y_idx + 96] = D_TYPE(dall * FLOAT_TYPE((x[i].scales[is+6] & 0xF) * ((qs >> 6) & 3)) - dmin * FLOAT_TYPE(x[i].scales[is+6] >> 4));
|
||||
}
|
||||
}
|
||||
"""
|
||||
dequant_q3_K_body = """
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE x[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE y[];};
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
int M;
|
||||
int K;
|
||||
int stride_a;
|
||||
int stride_b;
|
||||
} p;
|
||||
|
||||
void main() {
|
||||
[[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
|
||||
const int i = int(gl_WorkGroupID.x * 256 + wgy);
|
||||
if (i >= p.M * p.K / QUANT_K) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int r = int(gl_LocalInvocationID.x) / 4;
|
||||
const int tid = r / 2;
|
||||
const int is0 = r % 2;
|
||||
const int l0 = 16 * is0 + 4 * (int(gl_LocalInvocationID.x) % 4);
|
||||
const int n = tid / 4;
|
||||
const int j = tid - 4*n;
|
||||
|
||||
const uint8_t m = uint8_t(1 << (4*n + j));
|
||||
const int is = 8*n + 2*j + is0;
|
||||
const int shift = 2*j;
|
||||
|
||||
const int8_t us = int8_t(is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
|
||||
is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
|
||||
is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
|
||||
(x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4));
|
||||
const FLOAT_TYPE d_all = FLOAT_TYPE(x[i].d);
|
||||
const FLOAT_TYPE dl = d_all * FLOAT_TYPE(us - 32);
|
||||
|
||||
const int y_idx = i * QUANT_K + 128 * n + 32 * j;
|
||||
const int qs_idx = 32*n;
|
||||
|
||||
for (int l = l0; l < l0 + 4; ++l) {
|
||||
y[y_idx + l] = D_TYPE(dl * FLOAT_TYPE(int8_t((x[i].qs[qs_idx + l] >> shift) & 3) - (((x[i].hmask[l] & m) != 0) ? 0 : 4)));
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
dequant_q6_K_body = """
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
@@ -470,11 +583,11 @@ layout (push_constant) uniform parameter
} p;
|
||||
|
||||
void main() {
|
||||
for (int wgy = 0; wgy < 256; wgy++) {
|
||||
[[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
|
||||
const int i = int(gl_WorkGroupID.x * 256 + wgy);
|
||||
if (i >= p.M * p.K / QUANT_K) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
const int tid = int(gl_LocalInvocationID.x);
|
||||
const int ip = tid / 32;
|
||||
const int il = tid - 32 * ip;
|
||||
@@ -491,7 +604,7 @@ void main() {
y[y_idx + 32] = D_TYPE(d * FLOAT_TYPE(x[i].scales[is + 2] * (int8_t((x[i].ql[ql_idx + 32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
|
||||
y[y_idx + 64] = D_TYPE(d * FLOAT_TYPE(x[i].scales[is + 4] * (int8_t((x[i].ql[ql_idx + 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)));
|
||||
y[y_idx + 96] = D_TYPE(d * FLOAT_TYPE(x[i].scales[is + 6] * (int8_t((x[i].ql[ql_idx + 32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)));
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
@@ -553,6 +666,154 @@ void main() {
}
|
||||
"""
|
||||
|
||||
# K-quants
|
||||
mul_mat_vec_q2_K_body = """
|
||||
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE x[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE y[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
int ncols;
|
||||
} p;
|
||||
|
||||
shared FLOAT_TYPE tmp[32];
|
||||
|
||||
void main() {
|
||||
const int row = int(gl_WorkGroupID.x);
|
||||
|
||||
const int num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
const int tid = int(gl_LocalInvocationID.x)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = int(gl_LocalInvocationID.x)%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
|
||||
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
||||
|
||||
const int v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int v_in = tid - step*v_im; // 0...15 or 0...7
|
||||
|
||||
const int l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15
|
||||
const int q_offset = 32*v_im + l0;
|
||||
const int s_offset = 8*v_im;
|
||||
const int y_offset = 128*v_im + l0;
|
||||
|
||||
tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
|
||||
[[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
const int y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(x[ib0 + i].d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(x[ib0 + i].d.y);
|
||||
|
||||
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
|
||||
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
sum1 += FLOAT_TYPE(y[y_idx + l + 0]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l + 0] >> 0) & 3)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 16]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l +16] >> 0) & 3)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 32]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l + 0] >> 2) & 3)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 48]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l +16] >> 2) & 3)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 64]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l + 0] >> 4) & 3)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 80]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l +16] >> 4) & 3)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 96]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l + 0] >> 6) & 3)
|
||||
+ FLOAT_TYPE(y[y_idx + l +112]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((x[ib0 + i].qs[q_offset + l +16] >> 6) & 3);
|
||||
sum2 += FLOAT_TYPE(y[y_idx + l + 0]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 0] >> 4) & 0xF)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 16]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 1] >> 4) & 0xF)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 32]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 2] >> 4) & 0xF)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 48]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 3] >> 4) & 0xF)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 64]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 4] >> 4) & 0xF)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 80]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 5] >> 4) & 0xF)
|
||||
+ FLOAT_TYPE(y[y_idx + l + 96]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 6] >> 4) & 0xF)
|
||||
+ FLOAT_TYPE(y[y_idx + l +112]) * FLOAT_TYPE((x[ib0 + i].scales[s_offset + 7] >> 4) & 0xF);
|
||||
}
|
||||
tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (int s = 16; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
dst[row] = D_TYPE(tmp[0]);
|
||||
}
|
||||
}
|
||||
"""
|
||||
mul_mat_vec_q3_K_body = """
|
||||
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE x[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE y[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
int ncols;
|
||||
} p;
|
||||
|
||||
shared FLOAT_TYPE tmp[32];
|
||||
|
||||
void main() {
|
||||
const int row = int(gl_WorkGroupID.x);
|
||||
|
||||
const int num_blocks_per_row = p.ncols / QUANT_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
const int tid = int(gl_LocalInvocationID.x)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = int(gl_LocalInvocationID.x)%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
|
||||
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
||||
|
||||
const int v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int v_in = tid - step*v_im; // 0...15 or 0...7
|
||||
|
||||
const uint8_t m = uint8_t(1 << (4 * v_im));
|
||||
|
||||
const int l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15
|
||||
const int q_offset = 32*v_im + l0;
|
||||
const int y_offset = 128*v_im + l0;
|
||||
|
||||
tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
|
||||
const uint s_shift = 4 * v_im;
|
||||
|
||||
[[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
const int y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(x[ib0 + i].d);
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
sum += FLOAT_TYPE(y[y_idx + l + 0]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[0] >> s_shift) & 0xF) | ((x[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l ] ) & 3) - (((x[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4))
|
||||
+ FLOAT_TYPE(y[y_idx + l + 32]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[2] >> s_shift) & 0xF) | ((x[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((x[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4))
|
||||
+ FLOAT_TYPE(y[y_idx + l + 64]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[4] >> s_shift) & 0xF) | ((x[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((x[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4))
|
||||
+ FLOAT_TYPE(y[y_idx + l + 96]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[6] >> s_shift) & 0xF) | ((x[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((x[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4))
|
||||
+ FLOAT_TYPE(y[y_idx + l + 16]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[1] >> s_shift) & 0xF) | ((x[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l+16] ) & 3) - (((x[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4))
|
||||
+ FLOAT_TYPE(y[y_idx + l + 48]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[3] >> s_shift) & 0xF) | ((x[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((x[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4))
|
||||
+ FLOAT_TYPE(y[y_idx + l + 80]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[5] >> s_shift) & 0xF) | ((x[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((x[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4))
|
||||
+ FLOAT_TYPE(y[y_idx + l +112]) * FLOAT_TYPE(int8_t(((x[ib0 + i].scales[7] >> s_shift) & 0xF) | ((x[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((x[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((x[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4));
|
||||
}
|
||||
tmp[16 * ix + tid] += d * sum;
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (int s = 16; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
dst[row] = D_TYPE(tmp[0]);
|
||||
}
|
||||
}
|
||||
"""
|
||||
mul_mat_vec_q6_K_body = """
|
||||
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
@@ -596,7 +857,7 @@ void main() {
|
||||
tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
[[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
const int y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(x[ib0 + i].d);
|
||||
@@ -618,10 +879,10 @@ void main() {
+ FLOAT_TYPE(y[y_idx + l+32]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((x[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32)
|
||||
+ FLOAT_TYPE(y[y_idx + l+64]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((x[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32)
|
||||
+ FLOAT_TYPE(y[y_idx + l+96]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+32] >> 4) | (((x[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32);
|
||||
}
|
||||
}
|
||||
tmp[16 * ix + tid] += sum;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
@@ -843,8 +1104,8 @@ async def string_to_spv_file(name, code, defines, fp16):
preprocessed_code = stdout.decode()
|
||||
|
||||
cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
|
||||
code_with_lines = "\n".join([f"{i}: {line}" for i, line in enumerate(preprocessed_code.splitlines())])
|
||||
print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error=}")
|
||||
code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())])
|
||||
print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}")
|
||||
f.close()
|
||||
os.remove(f.name)
|
||||
sys.exit(proc.returncode)
|
@@ -919,6 +1180,10 @@ async def main():
            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, dequant_body))
        elif i == GGML_TYPE_Q8_0:
            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, dequant_body))
+        elif i == GGML_TYPE_Q2_K:
+            stream.extend((shader_q2_K_defines, dequant_q2_K_body))
+        elif i == GGML_TYPE_Q3_K:
+            stream.extend((shader_q3_K_defines, dequant_q3_K_body))
        elif i == GGML_TYPE_Q6_K:
            stream.extend((shader_q6_K_defines, dequant_q6_K_body))
        else:
@@ -943,6 +1208,10 @@ async def main():
            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, mul_mat_vec_body))
        elif i == GGML_TYPE_Q8_0:
            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, mul_mat_vec_body))
+        elif i == GGML_TYPE_Q2_K:
+            stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body))
+        elif i == GGML_TYPE_Q3_K:
+            stream.extend((shader_q3_K_defines, mul_mat_vec_q3_K_body))
        elif i == GGML_TYPE_Q6_K:
            stream.extend((shader_q6_K_defines, mul_mat_vec_q6_K_body))
        else: