diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index f4eba70ff..15c546498 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -71,7 +71,7 @@ vk::Device vk_device;
 vk::CommandPool vk_command_pool_compute, vk_command_pool_transfer;
 VmaAllocator vk_allocator;
 vk_pipeline vk_pipeline_matmul_f32, vk_pipeline_matmul_f16;
-vk_pipeline vk_pipeline_f16_to_f32;
+vk_pipeline vk_pipeline_f16_to_f32, vk_pipeline_dequant_q4_0;
 VmaAllocation vk_buffer_qa_alloc, vk_buffer_a_alloc, vk_buffer_b_alloc, vk_buffer_c_alloc;
 vk::Buffer vk_buffer_qa, vk_buffer_a, vk_buffer_b, vk_buffer_c;
 
@@ -332,6 +332,7 @@ void ggml_vk_init(void) {
     }
 
     vk_pipeline_f16_to_f32 = ggml_vk_create_pipeline("vk_shaders/f16_to_f32.spv", "main", 2, 1, {32, 1, 1});
+    vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_q4_0.spv", "main", 2, 1, {32, 1, 1});
 
     // Command pools
     vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(), vk_compute_queue_family_index);
@@ -359,8 +360,8 @@
 
 static vk_pipeline* ggml_get_to_fp32_vk(ggml_type type) {
     switch (type) {
-        // case GGML_TYPE_Q4_0:
-        //     return &dequantize_row_q4_0_cl;
+        case GGML_TYPE_Q4_0:
+            return &vk_pipeline_dequant_q4_0;
         // case GGML_TYPE_Q4_1:
         //     return &dequantize_row_q4_1_cl;
         // case GGML_TYPE_Q5_0:
@@ -1022,7 +1023,7 @@ bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
-    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 /*|| ggml_is_quantized(src0->type)*/) &&
+    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
         ((ne0 >= 128 && ne1 >= 32 && ne10 >= 128) || src0->backend == GGML_BACKEND_GPU)) {
diff --git a/ggml.c b/ggml.c
index 0d7bf38fa..c52717100 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11044,6 +11044,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
         }
         return;
     }
+#elif defined(GGML_USE_VULKAN)
+    if (ggml_vk_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_vk_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
diff --git a/vk_shaders/dequant_q4_0.glsl b/vk_shaders/dequant_q4_0.glsl
new file mode 100644
index 000000000..29d24b597
--- /dev/null
+++ b/vk_shaders/dequant_q4_0.glsl
@@ -0,0 +1,57 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+
+#define QUANT_K 32
+#define QUANT_R 2
+
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+
+struct block_q4_0
+{
+    float16_t d;
+    uint8_t qs[16];
+};
+
+layout (binding = 0) readonly buffer A { block_q4_0 x[]; };
+layout (binding = 1) writeonly buffer D { float y[]; };
+
+layout (push_constant) uniform parameter
+{
+    int N;
+} p;
+
+void main() {
+    const int idx = int(gl_GlobalInvocationID.x);
+
+    const int i = int(gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_LocalInvocationID.x*2);
+
+    if (idx >= p.N) {
+        return;
+    }
+
+    const int qk = QUANT_K;
+    const int qr = QUANT_R;
+
+    const int ib = i/qk; // block index
+    const int iqs = (i%qk)/qr; // quant index
+    const int iybs = i - i%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    float v0, v1;
+    const float d = float(x[ib].d);
+
+    const uint8_t vui = x[ib].qs[iqs];
+
+    const int8_t vi0 = int8_t(vui & 0xF);
+    const int8_t vi1 = int8_t(vui >> 4);
+
+    v0 = (vi0 - 8)*d;
+    v1 = (vi1 - 8)*d;
+
+    y[iybs + iqs + 0] = v0;
+    y[iybs + iqs + y_offset] = v1;
+}