Remove unnecessary scalar layout extension
This commit is contained in:
parent
7e88677af4
commit
5ae5d2bd5b
2 changed files with 3 additions and 82 deletions
|
@ -171,7 +171,6 @@ v = v * d;
|
||||||
const std::string mulmat_head = R"(
|
const std::string mulmat_head = R"(
|
||||||
#version 450
|
#version 450
|
||||||
|
|
||||||
#extension GL_EXT_scalar_block_layout : require
|
|
||||||
#extension GL_EXT_control_flow_attributes : enable
|
#extension GL_EXT_control_flow_attributes : enable
|
||||||
#extension GL_EXT_shader_16bit_storage : require
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
|
|
||||||
|
@ -185,7 +184,7 @@ const std::string mulmat_head = R"(
|
||||||
const std::string mulmat_body = R"(
|
const std::string mulmat_body = R"(
|
||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (binding = 0, scalar) readonly buffer A { A_TYPE data_a[]; };
|
layout (binding = 0) readonly buffer A { A_TYPE data_a[]; };
|
||||||
layout (binding = 1) readonly buffer B { B_TYPE data_b[]; };
|
layout (binding = 1) readonly buffer B { B_TYPE data_b[]; };
|
||||||
layout (binding = 2) writeonly buffer D { D_TYPE data_d[]; };
|
layout (binding = 2) writeonly buffer D { D_TYPE data_d[]; };
|
||||||
|
|
||||||
|
@ -393,7 +392,6 @@ void main() {
|
||||||
const std::string dequant_head = R"(
|
const std::string dequant_head = R"(
|
||||||
#version 450
|
#version 450
|
||||||
|
|
||||||
#extension GL_EXT_scalar_block_layout : require
|
|
||||||
#extension GL_EXT_control_flow_attributes : require
|
#extension GL_EXT_control_flow_attributes : require
|
||||||
#extension GL_EXT_shader_16bit_storage : require
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
||||||
|
@ -402,7 +400,7 @@ const std::string dequant_head = R"(
|
||||||
const std::string dequant_body = R"(
|
const std::string dequant_body = R"(
|
||||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (binding = 0, scalar) readonly buffer A { A_TYPE x[]; };
|
layout (binding = 0) readonly buffer A { A_TYPE x[]; };
|
||||||
layout (binding = 1) writeonly buffer D { D_TYPE y[]; };
|
layout (binding = 1) writeonly buffer D { D_TYPE y[]; };
|
||||||
|
|
||||||
layout (push_constant) uniform parameter
|
layout (push_constant) uniform parameter
|
||||||
|
@ -444,7 +442,6 @@ void main() {
|
||||||
const std::string mul_mat_vec_head = R"(
|
const std::string mul_mat_vec_head = R"(
|
||||||
#version 450
|
#version 450
|
||||||
|
|
||||||
#extension GL_EXT_scalar_block_layout : require
|
|
||||||
#extension GL_EXT_control_flow_attributes : enable
|
#extension GL_EXT_control_flow_attributes : enable
|
||||||
#extension GL_EXT_shader_16bit_storage : require
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
#extension GL_EXT_shader_8bit_storage : require
|
#extension GL_EXT_shader_8bit_storage : require
|
||||||
|
@ -453,7 +450,7 @@ const std::string mul_mat_vec_head = R"(
|
||||||
const std::string mul_mat_vec_body = R"(
|
const std::string mul_mat_vec_body = R"(
|
||||||
layout(local_size_x = QUANT_K, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x = QUANT_K, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (binding = 0, scalar) readonly buffer A { A_TYPE x[]; };
|
layout (binding = 0) readonly buffer A { A_TYPE x[]; };
|
||||||
layout (binding = 1) readonly buffer B { B_TYPE y[]; };
|
layout (binding = 1) readonly buffer B { B_TYPE y[]; };
|
||||||
layout (binding = 2) writeonly buffer D { D_TYPE dst[]; };
|
layout (binding = 2) writeonly buffer D { D_TYPE dst[]; };
|
||||||
|
|
||||||
|
|
|
@ -796,7 +796,6 @@ void ggml_vk_test_transfer(size_t ne);
|
||||||
void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
|
void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
|
||||||
void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
|
void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
|
||||||
void ggml_vk_test_buffer_write_zeropad(size_t m, size_t k, size_t align);
|
void ggml_vk_test_buffer_write_zeropad(size_t m, size_t k, size_t align);
|
||||||
void ggml_vk_test_f32_to_f16(size_t m, size_t k);
|
|
||||||
|
|
||||||
void ggml_vk_init(void) {
|
void ggml_vk_init(void) {
|
||||||
#ifdef VK_DEBUG
|
#ifdef VK_DEBUG
|
||||||
|
@ -953,11 +952,6 @@ void ggml_vk_init(void) {
|
||||||
ggml_vk_test_buffer_write_zeropad(233, 97, 1);
|
ggml_vk_test_buffer_write_zeropad(233, 97, 1);
|
||||||
ggml_vk_test_buffer_write_zeropad(256, 128, 1);
|
ggml_vk_test_buffer_write_zeropad(256, 128, 1);
|
||||||
|
|
||||||
ggml_vk_test_f32_to_f16(214, 256);
|
|
||||||
ggml_vk_test_f32_to_f16(256, 2048);
|
|
||||||
ggml_vk_test_f32_to_f16(24, 1000);
|
|
||||||
ggml_vk_test_f32_to_f16(24, 24);
|
|
||||||
|
|
||||||
int step = 16;
|
int step = 16;
|
||||||
for (size_t m = step; m < 64; m += step) {
|
for (size_t m = step; m < 64; m += step) {
|
||||||
ggml_vk_test_transfer(1024 * 1024 * m);
|
ggml_vk_test_transfer(1024 * 1024 * m);
|
||||||
|
@ -2640,76 +2634,6 @@ void ggml_vk_test_transfer(size_t ne) {
|
||||||
free(x);
|
free(x);
|
||||||
free(y);
|
free(y);
|
||||||
}
|
}
|
||||||
// Test helper: fills a host array with m * k random floats, uploads it to a
// device buffer, dispatches vk_pipeline_f32_to_f16 over it, reads the fp16
// result back and prints the three timings plus the mean absolute error to
// std::cerr.
//
// m, k: the element count is m * k; both are also passed to the pipeline as
//       push constants (see `pc` below).
void ggml_vk_test_f32_to_f16(size_t m, size_t k) {
|

||||||
#ifdef VK_DEBUG
|

||||||
// NOTE(review): this trace prints the name "ggml_vk_test_transfer" rather than
// this function's name, and it uses `ne`, which is only declared further down
// in this function. Unless another `ne` is in scope, this line presumably
// fails to compile when VK_DEBUG is defined - confirm.
std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
|

||||||
#endif
|

||||||
// Round-trip random floats through the f32 -> f16 pipeline and report timing and error
|

||||||
// Total element count; note the size_t product is narrowed to 32 bits here.
const uint32_t ne = m * k;
|

||||||
// Device-local input (float) and output (ggml_fp16_t) buffers, ne elements each.
vk_buffer d_X = ggml_vk_create_buffer(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|

||||||
vk_buffer d_Y = ggml_vk_create_buffer(sizeof(ggml_fp16_t) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|

||||||
|

||||||
// Host-side staging arrays; the malloc results are not checked for NULL.
float* x = (float *) malloc(sizeof(float) * ne);
|

||||||
ggml_fp16_t* y = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * ne);
|

||||||
|

||||||
// Fill the input with pseudo-random values in [0, 1].
for (size_t i = 0; i < ne; i++) {
|

||||||
x[i] = rand() / (float)RAND_MAX;
|

||||||
}
|

||||||
|

||||||
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_f32_to_f16, 1);
|

||||||
|

||||||
// Phase 1: time the host -> device upload, including the wait for the queue to go idle.
auto begin = std::chrono::high_resolution_clock::now();
|

||||||
|

||||||
ggml_vk_buffer_write(&d_X, 0, x, sizeof(float) * ne, vk_device.transfer_queues[0]);
|

||||||
|

||||||
vk_device.transfer_queues[0].queue.waitIdle();
|

||||||
|

||||||
auto end = std::chrono::high_resolution_clock::now();
|

||||||
|

||||||
// Microsecond count divided by 1000.0 gives fractional milliseconds.
double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|

||||||
|

||||||
// Phase 2: time recording, submitting and completing the conversion dispatch.
begin = std::chrono::high_resolution_clock::now();
|

||||||
|

||||||
std::vector<vk_sequence> seqs;
|

||||||
vk_submission s = ggml_vk_begin_submission(vk_device.compute_queue);
|

||||||
// Push constants handed to the pipeline: m followed by k three times.
// NOTE(review): the meaning of each slot is defined by the shader, which is not visible here.
const std::vector<int> pc = { (int)m, (int)k, (int)k, (int)k };
|

||||||
// Synchronize both buffers before the dispatch, with the access flags shown.
ggml_vk_sync_buffers(s.buffer, { { d_X, 0, (uint32_t)sizeof(float) * ne } }, vk_device.compute_queue, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
|

||||||
ggml_vk_sync_buffers(s.buffer, { { d_Y, 0, (uint32_t)sizeof(ggml_fp16_t) * ne} }, vk_device.compute_queue, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
|

||||||
// Dispatch over { ne, 1, 1 } with d_X and d_Y bound, in that order.
ggml_vk_dispatch_pipeline(s, vk_pipeline_f32_to_f16, { { d_X, 0, (uint32_t)sizeof(float) * ne }, { d_Y, 0, (uint32_t)sizeof(ggml_fp16_t) * ne } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
|

||||||
ggml_vk_end_submission(s, {}, {});
|

||||||
seqs.push_back({ s });
|

||||||
|

||||||
ggml_vk_submit(vk_device.compute_queue, seqs, VK_NULL_HANDLE);
|

||||||
|

||||||
vk_device.compute_queue.queue.waitIdle();
|

||||||
|

||||||
end = std::chrono::high_resolution_clock::now();
|

||||||
|

||||||
double ms_convert = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|

||||||
|

||||||
// Phase 3: time the device -> host readback of the fp16 output.
begin = std::chrono::high_resolution_clock::now();
|

||||||
|

||||||
// Readback uses transfer_queues[1], whereas the upload used transfer_queues[0].
ggml_vk_buffer_read(&d_Y, 0, y, sizeof(ggml_fp16_t) * ne, vk_device.transfer_queues[1]);
|

||||||
|

||||||
end = std::chrono::high_resolution_clock::now();
|

||||||
|

||||||
double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|

||||||
|

||||||
// Sum the absolute difference between each input and its converted value;
// the sum is divided by ne when printed below.
double avg_err = 0.0;
|

||||||
for (size_t i = 0; i < ne; i++) {
|

||||||
avg_err += std::fabs(x[i] - ggml_fp16_to_fp32(y[i]));
|

||||||
}
|

||||||
|

||||||
std::cerr << "TEST F32 TO F16 " << ms_to_gpu << "ms to_gpu " << ms_convert << "ms convert " << ms_from_gpu << "ms from gpu avg_err=" << avg_err / ne << std::endl;
|

||||||
|

||||||
// Release the device buffers, the pipeline's per-run state and the host arrays.
ggml_vk_destroy_buffer(d_X);
|

||||||
ggml_vk_destroy_buffer(d_Y);
|

||||||
|

||||||
ggml_vk_pipeline_cleanup(vk_pipeline_f32_to_f16);
|

||||||
|

||||||
free(x);
|

||||||
free(y);
|

||||||
}
|
|
||||||
|
|
||||||
void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size) {
|
void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size) {
|
||||||
#ifdef VK_DEBUG
|
#ifdef VK_DEBUG
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue