diff --git a/Makefile b/Makefile
index 5523d9d04..a967e3c7c 100644
--- a/Makefile
+++ b/Makefile
@@ -215,12 +215,11 @@ endif # LLAMA_METAL
 
 ifdef LLAMA_VULKAN
 	CFLAGS += -DGGML_USE_VULKAN
-	LDFLAGS += -lvulkan
+	LDFLAGS += -lvulkan -lopenblas -lcblas
 	OBJS += ggml-vulkan.o
 
 ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-ggml-vulkan-matmul-shader:
-	glslc -fshader-stage=compute --target-env=vulkan1.2 -O ggml-vulkan-matmul.glsl -o ggml-vulkan-matmul.spv
+	glslc -fshader-stage=compute --target-env=vulkan1.2 -O ggml-vulkan-matmul.comp -o ggml-vulkan-matmul.spv
 endif
 
@@ -287,7 +286,6 @@ clean:
 #
 # Examples
 #
-
 main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
diff --git a/ggml-vulkan-matmul.comp b/ggml-vulkan-matmul.comp
index 4a34e950e..4341a51ac 100644
--- a/ggml-vulkan-matmul.comp
+++ b/ggml-vulkan-matmul.comp
@@ -1,10 +1,14 @@
 #version 450
 
-layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+#define BLOCKSIZE 32
 
-layout (binding = 0) readonly buffer A { float A_data[]; };
-layout (binding = 1) readonly buffer B { float B_data[]; };
-layout (binding = 2) writeonly buffer D { float D_data[]; };
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = BLOCKSIZE * BLOCKSIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A { float data_a[]; };
+layout (binding = 1) readonly buffer B { float data_b[]; };
+layout (binding = 2) writeonly buffer D { float data_d[]; };
 
 layout (push_constant) uniform parameter
 {
@@ -16,18 +20,42 @@ layout (push_constant) uniform parameter
 	int stride_d;
 } p;
 
+shared float buf_a[(BLOCKSIZE+1) * BLOCKSIZE];
+shared float buf_b[(BLOCKSIZE+1) * BLOCKSIZE];
+
 void main() {
-	int i01 = int(gl_GlobalInvocationID.x);
-	int i11 = int(gl_GlobalInvocationID.y);
+	const int lr = int(gl_LocalInvocationID.x % BLOCKSIZE);
+	const int lc = int(gl_LocalInvocationID.x / BLOCKSIZE);
 
-	if (i01 < p.M && i11 < p.N) {
-		float sum = 0.0f;
+	const int ir = int(gl_WorkGroupID.x);
+	const int ic = int(gl_WorkGroupID.y);
 
-		for (int i = 0; i < p.K; i++) {
-			sum += A_data[i01 * p.stride_a + i] * B_data[i11 * p.stride_b + i];
+	int pos_a = ir * BLOCKSIZE * p.stride_a;
+	int pos_b = ic * BLOCKSIZE * p.stride_b;
+
+	float sum = 0.0f;
+
+	[[unroll]] for (int i = 0; i < p.K; i += BLOCKSIZE) {
+		buf_a[lc * (BLOCKSIZE+1) + lr] = data_a[pos_a + lc * p.stride_a + lr];
+		buf_b[lc * (BLOCKSIZE+1) + lr] = data_b[pos_b + lc * p.stride_b + lr];
+
+		barrier();
+
+		pos_a += BLOCKSIZE;
+		pos_b += BLOCKSIZE;
+
+		[[unroll]] for (int j = 0; j < BLOCKSIZE; j++) {
+			sum += buf_a[lr * (BLOCKSIZE+1) + j] * buf_b[lc * (BLOCKSIZE+1) + j];
 		}
-
-		D_data[i11 * p.stride_d + i01] = sum;
+		barrier();
+	}
+
+	const int dr = ir * BLOCKSIZE + lr;
+	const int dc = ic * BLOCKSIZE + lc;
+
+	if (dr < p.M && dc < p.N) {
+		data_d[dc * p.stride_d + dr] = sum;
 	}
 }
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index f42254ecd..77de265dc 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1,5 +1,8 @@
 #include "ggml-vulkan.h"
 
+#include <chrono>
+#include <iostream>
+
 #include <vulkan/vulkan.hpp>
 #define VMA_IMPLEMENTATION
 #if UINTPTR_MAX == 0xFFFFFFFF
@@ -29,6 +32,7 @@ inline static void* ggml_aligned_malloc(size_t size, size_t alignment) {
 #include <atomic>
 #include <fstream>
 #include <vector>
+#include <cblas.h>
 
 #include "ggml.h"
@@ -199,7 +203,7 @@ static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf) {
     };
 
     VmaAllocationCreateInfo allocation_info = {};
-    allocation_info.usage = VMA_MEMORY_USAGE_AUTO;
+    allocation_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
     allocation_info.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
     vmaCreateBuffer(vk_allocator,
@@ -455,6 +459,8 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02);
 
         // compute
+        auto begin = std::chrono::high_resolution_clock::now();
+
         vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
         cmd_buffer.begin(cmd_buffer_begin_info);
         cmd_buffer.pushConstants(vk_pipeline_matmul_layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
@@ -480,10 +486,34 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             true, uint64_t(-1));
 
+        auto end = std::chrono::high_resolution_clock::now();
+
+        std::cout << "m=" << ne01 << " n=" << ne11 << " k=" << ne10 << " matmul " << std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0 << "ms" << std::endl;
+
         // copy dst to host
         float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-        float * d_blas = (float *) malloc(sizeof(float) * d_ne);
         ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne);
+
+#if 0
+        const float * x = (float *) ((char *) src0->data);
+        const float * y = (float *) ((char *) src1->data);
+        float * d_chk = (float *) malloc(sizeof(float) * d_ne);
+
+        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                ne01, ne11, ne10,
+                1.0f, x, ne00,
+                y, ne10,
+                0.0f, d_chk, ne01);
+
+        for (size_t i = 0; i < d_ne; i++) {
+            if (std::fabs(d[i] - d_chk[i]) > 0.01f) {
+                printf("d[%zu] = %f d_chk[%zu] = %f\n", i, d[i], i, d_chk[i]);
+                abort();
+            }
+        }
+
+        free(d_chk);
+#endif
     }
 }