Merge branch 'master' into compilade/bitnet-ternary

Francis Couture-Harpin 2024-08-11 15:52:29 -04:00
commit d911cd1f13
138 changed files with 7065 additions and 1937 deletions

View file

@ -207,6 +207,7 @@ set(GGML_PUBLIC_HEADERS
include/ggml-alloc.h
include/ggml-backend.h
include/ggml-blas.h
include/ggml-cann.h
include/ggml-cuda.h
include/ggml.h
include/ggml-kompute.h

View file

@ -50,6 +50,8 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family

View file

@ -349,6 +349,7 @@ extern "C" {
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
struct ggml_object;
@ -1141,16 +1142,17 @@ extern "C" {
// group normalize along ne0*ne1*n_groups
// used in stable-diffusion
// TODO: eps is hardcoded to 1e-6 for now
GGML_API struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups);
int n_groups,
float eps);
GGML_API struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups);
int n_groups,
float eps);
// a - x
// b - dy
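For context, a minimal call-site sketch of the updated API (the tensor names and the eps value are illustrative, not taken from this commit; 1e-6f matches the previously hardcoded default):

    // before this change: cur = ggml_group_norm(ctx, a, n_groups);
    // after, the caller supplies eps explicitly:
    struct ggml_tensor * cur = ggml_group_norm(ctx, a, /*n_groups=*/32, /*eps=*/1e-6f);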
@ -1457,7 +1459,6 @@ extern "C" {
// if mode & 2 == 1, GPT-NeoX style
//
// b is an int32 vector with size a->ne[2], it contains the positions
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1474,6 +1475,7 @@ extern "C" {
int mode);
// custom RoPE
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,

View file

@ -849,11 +849,6 @@ if (GGML_CANN)
${CANN_INSTALL_DIR}/acllib/include
)
# TODO: find libs
link_directories(
${CANN_INSTALL_DIR}/lib64
)
add_subdirectory(ggml-cann/kernels)
list(APPEND CANN_LIBRARIES
ascendcl
@ -872,6 +867,7 @@ if (GGML_CANN)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
endif()
else()

View file

@ -16,6 +16,8 @@
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
#elif defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define UNUSED GGML_UNUSED
@ -384,8 +386,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE)
if (svcntw() == 8) {
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
if (ggml_sve_cnt_b == QK8_0) {
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@ -496,8 +498,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE)
if (svcntw() == 8) {
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
if (ggml_sve_cnt_b == QK8_0) {
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@ -614,7 +616,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
if (ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@ -680,12 +682,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
return;
}
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (ggml_cpu_has_neon()) {
GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
@ -745,8 +747,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
if (svcntw() == 8) {
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
if (ggml_sve_cnt_b == QK8_0) {
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@ -1266,8 +1268,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
if (svcntw() == 8) {
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
if (ggml_sve_cnt_b == QK8_0) {
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@ -1728,7 +1730,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
if (ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@ -2139,12 +2141,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
return;
}
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (ggml_cpu_has_neon()) {
GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
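The ggml_sve_cnt_b == QK8_0 check substituted throughout this file is equivalent to the old svcntw() == 8 test on 256-bit SVE hardware, since eight 32-bit lanes span 32 bytes and QK8_0 is 32. A hedged sketch of that relationship (it assumes ggml_sve_cnt_b caches svcntb() at startup, which is not shown in this diff):

    #include <arm_sve.h>
    // 256-bit SVE vector:  svcntw() == 8  <=>  svcntb() == 32  <=>  ggml_sve_cnt_b == QK8_0
    static inline int sve_vector_is_256bit(void) {
        return svcntb() == 32;   // svcntb() returns the byte width of one SVE vector
    }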

View file

@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
}
// an async copy would normally happen after all the queued operations on both backends are completed
// sync src, set_async dst
if (ggml_backend_buffer_is_host(src->buffer)) {
ggml_backend_synchronize(backend_src);
ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
} else {
ggml_backend_synchronize(backend_src);
ggml_backend_tensor_copy(src, dst);
ggml_backend_synchronize(backend_dst);
}
// to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
ggml_backend_synchronize(backend_src);
ggml_backend_synchronize(backend_dst);
ggml_backend_tensor_copy(src, dst);
}
// events
@ -1782,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy(input, input_cpy);
}
}
}

View file

@ -627,7 +627,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
const void* src,
void* dst) {
GGML_ASSERT(tensor->op == GGML_OP_NONE);
int64_t n_elems = ggml_nelements(tensor);
int64_t groups = n_elems / QK4_0;
@ -679,7 +678,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
*/
GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
const ggml_tensor* tensor, void* src, void* dst) {
GGML_ASSERT(tensor->op == GGML_OP_NONE);
int64_t n_elems = ggml_nelements(tensor);
int64_t groups = n_elems / QK4_0;
@ -898,11 +896,10 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
* @param size Size of the data to be copied, in bytes.
*/
GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data,
ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
size_t offset, size_t size) {
// GGML_ASSERT(size == ggml_nbytes(tensor));
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
ggml_backend_cann_buffer_context *ctx =
(ggml_backend_cann_buffer_context *)buffer->context;
ggml_cann_set_device(ctx->device);
// TODO: refer to cann(#6017), it use thread's default stream.
@ -910,22 +907,21 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
// Why aclrtSynchronizeDevice?
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset,
size, ACL_MEMCPY_HOST_TO_DEVICE));
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
ACL_MEMCPY_HOST_TO_DEVICE));
} else {
void* transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, (const char*)data + offset,
transform_buffer);
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
void* check_buffer = malloc(size);
void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) ==
0);
GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size,
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE));
free(transform_buffer);
}
@ -947,21 +943,20 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
size_t offset, size_t size) {
GGML_ASSERT(size == ggml_nbytes(tensor));
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
ggml_cann_set_device(ctx->device);
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size,
ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
ACL_MEMCPY_DEVICE_TO_HOST));
} else {
void* transform_buffer = malloc(size);
ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size,
ACL_CHECK(aclrtMemcpy(transform_buffer, size,
(char*)tensor->data + offset, size,
ACL_MEMCPY_DEVICE_TO_HOST));
ggml_backend_cann_transform_back(tensor, transform_buffer,
(char*)data + offset);
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
free(transform_buffer);
}
}
@ -1450,42 +1445,41 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
* @param size Size of the data to copy in bytes.
*/
GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
ggml_tensor* tensor,
const void* data,
ggml_tensor *tensor,
const void *data,
size_t offset,
size_t size) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
ggml_backend_cann_context *cann_ctx =
(ggml_backend_cann_context *)backend->context;
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpyAsync(
tensor->data, size, (const char*)data + offset, size,
ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
size, ACL_MEMCPY_HOST_TO_DEVICE,
cann_ctx->stream()));
} else {
void* transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, (const char*)data + offset,
transform_buffer);
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
void* check_buffer = malloc(size);
void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size));
GGML_ASSERT(memcmp(data, check_buffer, size));
free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE,
cann_ctx->stream()));
ACL_CHECK(aclrtMemcpyAsync(
(char *)tensor->data + offset, size, transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
free(transform_buffer);
}
}
GGML_CALL static void ggml_backend_cann_get_tensor_async(
ggml_backend_t backend, const ggml_tensor* tensor, void* data,
ggml_backend_t backend, const ggml_tensor *tensor, void *data,
size_t offset, size_t size) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
ggml_backend_cann_context *cann_ctx =
(ggml_backend_cann_context *)backend->context;
ggml_backend_buffer_t buf =
tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@ -1493,17 +1487,16 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
"unsupported buffer type");
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data,
ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
size, ACL_MEMCPY_DEVICE_TO_HOST,
cann_ctx->stream()));
} else {
void* transform_buffer = malloc(size);
ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size,
ACL_MEMCPY_DEVICE_TO_HOST,
cann_ctx->stream()));
void *transform_buffer = malloc(size);
ACL_CHECK(aclrtMemcpyAsync(
transform_buffer, size, (char *)tensor->data + offset, size,
ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
ggml_backend_cann_transform_back(tensor, transform_buffer,
(char*)data + offset);
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
free(transform_buffer);
}
}
@ -1666,10 +1659,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
}
case GGML_OP_MUL_MAT: {
switch (op->src[0]->type) {
// case GGML_TYPE_Q4_0:
case GGML_TYPE_F16:
case GGML_TYPE_F32:
case GGML_TYPE_Q8_0:
// TODO: fix me
// Current groupsize should not be greater than k-1 in
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
case GGML_TYPE_Q4_0:
return true;
default:
return false;
@ -1694,6 +1690,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
return true;
default:
return false;
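The set/get tensor changes above move the byte offset from the host pointer to the device pointer; a hedged illustration of the intended semantics (the wrapper name is hypothetical, not part of the CANN backend API):

    // ggml_backend_cann_buffer_set_tensor(buffer, tensor, data, offset, size)
    //   before: copied from (const char *)data + offset  into  tensor->data
    //   after:  copies  from data                        into  (char *)tensor->data + offset
    // i.e. offset now selects a region inside the tensor, while data already points at the bytes to write:
    cann_memcpy_h2d((char *)tensor->data + offset, data, size);   // hypothetical wrapper around aclrtMemcpy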

View file

@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
return ACL_INT16;
case GGML_TYPE_I32:
return ACL_INT32;
case GGML_TYPE_Q4_0:
return ACL_INT4;
case GGML_TYPE_Q8_0:
return ACL_INT8;
default:
return ACL_DT_UNDEFINED;
}
@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
return false;
}
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
size_t type_size, int64_t* ne, size_t* nb,
int64_t dims, aclFormat format,
size_t offset) {
int64_t tmp_ne[GGML_MAX_DIMS * 2];
int64_t tmp_stride[GGML_MAX_DIMS * 2];
memcpy(tmp_ne, ne, dims * sizeof(int64_t));
for (int i = 0; i < dims; i++) {
tmp_stride[i] = nb[i] / type_size;
}
std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);
int64_t acl_storage_len = 0;
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
}
aclTensor* acl_tensor =
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
format, &acl_storage_len, 1, data_ptr);
return acl_tensor;
}
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
const ggml_tensor* src1,
int64_t* bcast_src0_ne,

View file

@ -23,6 +23,9 @@
#ifndef CANN_ACL_TENSOR_H
#define CANN_ACL_TENSOR_H
#include <algorithm>
#include <cstring>
#include <aclnn/aclnn_base.h>
#include "common.h"
@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
size_t offset = 0);
/**
* @brief Creates an ACL tensor from provided parameters.
* @brief Template for creating an ACL tensor from provided parameters. typename TYPE
* should be size_t or float.
*
* @details This function creates an ACL tensor using the provided data pointer,
* data type, dimensions, strides, format, offset, and additional parameters.
@ -83,10 +87,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
* @return Pointer to the created ACL tensor.
*/
template<typename TYPE>
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
size_t type_size, int64_t* ne, size_t* nb,
int64_t dims, aclFormat format = ACL_FORMAT_ND,
size_t offset = 0);
TYPE type_size, int64_t* ne, TYPE* nb,
int64_t dims,
aclFormat format = ACL_FORMAT_ND,
size_t offset = 0) {
int64_t tmp_ne[GGML_MAX_DIMS * 2];
int64_t tmp_stride[GGML_MAX_DIMS * 2];
memcpy(tmp_ne, ne, dims * sizeof(int64_t));
for (int i = 0; i < dims; i++) {
tmp_stride[i] = nb[i] / type_size;
}
std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);
int64_t acl_storage_len = 0;
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
}
aclTensor* acl_tensor =
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
format, &acl_storage_len, 1, data_ptr);
return acl_tensor;
}
/**
* @brief Checks if tensors require broadcasting based on their shapes.
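A brief usage sketch of the templated overload above, showing the fractional element size that motivates it (pointer and shape names are illustrative): for Q4_0 data each packed element occupies half a byte, so type_size and the byte strides are passed as float and TYPE is deduced as float:

    // given: void * dev_ptr (device buffer), int64_t cols, rows
    int64_t ne[2] = {cols, rows};                    // innermost dimension first, ggml-style
    float   nb[2] = {0.5f, 0.5f * cols};             // byte strides: 0.5 bytes per int4 element
    aclTensor * t = ggml_cann_create_tensor(dev_ptr, ACL_INT4, /*type_size=*/0.5f,
                                            ne, nb, /*dims=*/2);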

View file

@ -464,9 +464,11 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
aclTensor* acl_src = ggml_cann_create_tensor(src);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
const float eps = 1e-6f; // TODO: make this a parameter
int n_groups = dst->op_params[0];
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
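Reading eps from op_params + 1 implies the producer packs the parameters as [n_groups, raw float bits of eps]; a sketch of that packing on the ggml side (an assumption mirroring the memcpy consumer shown here, not code copied from this commit):

    int32_t params[2];
    params[0] = n_groups;                        // op_params[0]
    memcpy(&params[1], &eps, sizeof(float));     // op_params[1] holds the float bit pattern
    // the ggml op constructor then stores params into result->op_params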
@ -910,6 +912,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->ne);
return;
}
if (dst->type == GGML_TYPE_Q4_0) {
aclrtlaunch_ascendc_quantize_f16_to_q4_0(
24, ctx.stream(), src->data, dst->data,
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
((ggml_tensor*)dst->extra)->ne);
return;
}
if (dst->type == GGML_TYPE_F16) {
if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst);
@ -971,6 +980,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->ne);
return;
}
if (dst->type == GGML_TYPE_Q4_0) {
aclrtlaunch_ascendc_quantize_f32_to_q4_0(
24, ctx.stream(), src->data, dst->data,
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
((ggml_tensor*)dst->extra)->ne);
return;
}
if (dst->type == GGML_TYPE_F32) {
if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst);
@ -1312,6 +1328,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
#ifdef __cplusplus
}
#endif
static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
ggml_tensor* dst,
ggml_tensor* src1,
aclTensor* tmp_cast_tensor,
aclTensor* tmp_im2col_tensor) {
// Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
aclTensor* acl_dst =
ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
int64_t permute_dim[] = {0, 2, 1};
if (src1->type != dst->type) {
aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
} else {
aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
}
// release
ACL_CHECK(aclDestroyTensor(acl_dst));
}
static void ggml_cann_im2col_1d_post_process(
ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
const std::vector<int64_t>& im2col_op_params) {
// get params
const int64_t KH = im2col_op_params[0];
const int64_t KW = im2col_op_params[1];
const int64_t IW = im2col_op_params[2];
const int64_t IC = im2col_op_params[3];
const int64_t N = im2col_op_params[4];
const int64_t OH = im2col_op_params[5];
const int64_t OW = im2col_op_params[6];
const int64_t s0 = im2col_op_params[7];
const int64_t p0 = im2col_op_params[8];
const int64_t d0 = im2col_op_params[9];
const int64_t n_bytes_factor = im2col_op_params[10];
// Permute: [N, IC * KH * KW, OW * OH] ->
// [N, OW * OH * n_bytes_factor, IC * KH * KW]
aclTensor* tmp_permute_tensor = nullptr;
ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
void* tmp_permute_buffer = tmp_permute_allocator.get();
int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
tmp_permute_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
}
tmp_permute_tensor = ggml_cann_create_tensor(
tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
int64_t permute_dim[] = {0, 2, 1};
if (src1->type != dst->type) {
aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
} else {
aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
3);
}
// number of times the kernel moves in W dimension
const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
size_t offset;
void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
// memory copy with offset to restore 1D im2col from 2d
if (IC > 1) {
offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
size_t size_cpy = KH * KW * ggml_type_size(dst->type);
for (int c = 0; c < IC; c++) {
cur_permute_buffer = (char*)tmp_permute_buffer + offset +
KH * KW * c * ggml_type_size(dst->type);
cur_dst_buffer = (char*)dst->data +
c * KH * KW * n_step_w * ggml_type_size(dst->type);
for (int i = 0; i < n_step_w; i++) {
ACL_CHECK(aclrtMemcpyAsync(
cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
cur_dst_buffer =
(char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
cur_permute_buffer = (char*)cur_permute_buffer +
KH * KW * IC * ggml_type_size(dst->type);
}
}
} else {
offset = KH * KW * n_step_w *
ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
(char*)tmp_permute_buffer + offset, offset,
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
}
// release
ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
}
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src0 = dst->src[0]; // kernel
ggml_tensor* src1 = dst->src[1]; // input
@ -1320,21 +1441,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
GGML_TENSOR_BINARY_OP_LOCALS;
const int64_t N = is_2D ? ne13 : ne12;
const int64_t IC = is_2D ? ne12 : ne11;
// aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
// im2col and do post-processing to restore it to 1D.
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
const int64_t KH = is_2D ? ne01 : 1;
const int64_t N = ne13;
const int64_t IC = ne12;
const int64_t KH = ne01;
const int64_t KW = ne00;
const int64_t IW = ne10;
const int64_t OH = is_2D ? ne2 : 1;
const int64_t OW = ne1;
@ -1342,9 +1465,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
// im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
// memory allocated increased to 3x when is_2D == false
const int64_t n_bytes_factor = is_2D ? 1 : 3;
// im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
tmp_im2col_nb[0] = ggml_type_size(src1->type);
@ -1356,8 +1482,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
// dst.elemcount.
ggml_cann_pool_alloc im2col_allocator(
ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1));
ctx.pool(),
ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
void* tmp_im2col_buffer = im2col_allocator.get();
aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
@ -1380,8 +1508,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
paddings, strides, tmp_im2col_tensor,
&workspaceSize, &executor));
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspace_allocator.alloc(workspaceSize);
workspaceAddr = workspace_allocator.get();
}
@ -1391,9 +1520,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// Cast if dst is f16.
aclTensor* tmp_cast_tensor = nullptr;
ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
void* tmp_cast_buffer = nullptr;
if (src1->type != dst->type) {
tmp_cast_allocator.alloc(ggml_nbytes(dst));
void* tmp_cast_buffer = tmp_cast_allocator.get();
tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
tmp_cast_buffer = tmp_cast_allocator.get();
size_t temp_cast_nb[GGML_MAX_DIMS - 1];
temp_cast_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
@ -1408,24 +1538,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_type_mapping(dst->type));
}
// Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
aclTensor* acl_dst =
ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
int64_t permute_dim[] = {0, 2, 1};
if (src1->type != dst->type) {
aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
// post-processing
if (is_2D) {
ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
tmp_im2col_tensor);
} else {
aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
std::vector<int64_t> im2col_op_params = {
KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
tmp_im2col_tensor, im2col_op_params);
}
// release
ACL_CHECK(aclDestroyTensor(acl_src1));
ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
ACL_CHECK(aclDestroyTensor(acl_dst));
ACL_CHECK(aclDestroyIntArray(kernel_size));
ACL_CHECK(aclDestroyIntArray(dilations));
ACL_CHECK(aclDestroyIntArray(paddings));
@ -2352,21 +2479,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
* @param dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
ggml_tensor* dst) {
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
ggml_tensor* dst,
const enum ggml_type type) {
ggml_tensor* src0 = dst->src[0]; // weight
ggml_tensor* src1 = dst->src[1]; // input
// The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
// is regarded as batch. weight need transpose.
int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
size_t weight_elem_size = sizeof(uint8_t);
size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
float weight_elem_size;
if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2;
}
else if (type == GGML_TYPE_Q8_0) {
weight_elem_size = float(sizeof(uint8_t));
}
else {
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
}
float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
// size of one matrix is element_size * height * width.
size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
// scale stored at the end of weight. Also need transpose.
GGML_ASSERT(QK4_0 == QK8_0);
int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
@ -2381,10 +2520,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
ggml_cann_pool_alloc input_alloctor(ctx.pool());
if (src1->type != GGML_TYPE_F16) {
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
ggml_cann_pool_alloc input_alloctor(
ctx.pool(), ggml_nelements(src1) * input_elem_size);
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
input_buffer = input_alloctor.get();
int64_t* input_cast_ne = src1->ne;
@ -2430,8 +2569,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
input_elem_size, input_ne, input_nb, 2);
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride, ACL_INT8,
weight_elem_size, weight_ne, weight_nb, 2);
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2);
@ -2485,11 +2625,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
case GGML_TYPE_F16:
ggml_cann_mat_mul_fp(ctx, dst);
break;
// case GGML_TYPE_Q4_0:
// ggml_cann_mul_mat_q4_0(ctx, dst);
// break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
ggml_cann_mul_mat_q8_0(ctx, dst);
ggml_cann_mul_mat_quant(ctx, dst, type);
break;
default:
GGML_ABORT("fatal error");
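For orientation, a sketch of the device-side layout that ggml_cann_mul_mat_quant assumes (per the diff: half a byte per Q4_0 weight element, fp16 scales appended after all weight data; variable names are illustrative):

    // [ packed int4 weights: 0.5 * ne0 * ne1 * ne2 * ne3 bytes ][ fp16 scales: (ne0 / QK8_0) * ne1 * ne2 * ne3 * 2 bytes ]
    size_t weight_size = (size_t)(0.5f * src0->ne[0] * src0->ne[1]) * src0->ne[2] * src0->ne[3];
    char * scale_base  = (char *)src0->data + weight_size;   // "scale stored at the end of weight"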

View file

@ -9,6 +9,7 @@ file(GLOB SRC_FILES
get_row_q8_0.cpp
quantize_f32_q8_0.cpp
quantize_f16_q8_0.cpp
quantize_float_to_q4_0.cpp
dup.cpp
)
@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
${SRC_FILES}
)
#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

View file

@ -8,6 +8,8 @@
#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"

View file

@ -0,0 +1,278 @@
#include "kernel_operator.h"
using namespace AscendC;
#define BUFFER_NUM 2
#define Group_Size 32
template <typename SRC_T>
class QUANTIZE_FLOAT_TO_Q4_0 {
public:
__aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
int64_t *input_ne_ub, size_t *input_nb_ub,
int64_t *output_ne_ub) {
// TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
// permute=[0,0,0,0]):
// [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
// input stride of data elements
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
output_ne[i] = output_ne_ub[i];
}
// output stride of data elements
output_stride[0] = 1;
for (int i = 1; i < 4; i++) {
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
}
// scale saved one by one after data:. [group1_scale, group2_scale, ...]
scale_ne = input_ne;
scale_stride[0] = 1;
scale_stride[1] = input_ne[0] / Group_Size;
for (int i = 2; i < 4; i++) {
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
}
// split input tensor by rows.
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
dr = nr / op_block_num;
uint64_t tails = nr % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
group_size_in_row = scale_stride[1];
int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
output_ne[3] * sizeof(uint8_t) / 2;
input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
group_size_in_row *
sizeof(half)));
pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
pipe.InitBuffer(output_queue, BUFFER_NUM,
Group_Size * sizeof(int8_t) / 2);
pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
}
__aicore__ inline void copy_in(uint32_t offset) {
LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
DataCopy(input_local, input_gm[offset], Group_Size);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset) {
// reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
// and using DataCopyPad to avoid 32 bits align.
LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
LocalTensor<int8_t> output_int8_local =
output_local.ReinterpretCast<int8_t>();
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
output_queue.FreeTensor(output_local);
}
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
LocalTensor<float> input_local) {
DataCopy(cast_local, input_local, Group_Size);
}
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
LocalTensor<half> input_local) {
Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
}
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
const int64_t i1 =
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
const int64_t input_offset = i1 * input_stride[1] +
i2 * input_stride[2] +
i3 * input_stride[3] + Group_Size * group;
// output_offset is stride for output_gm which datatype is int8_t and
// divided by 2 is needed for int4b_t.
const int64_t output_offset = (i1 * output_stride[1] +
i2 * output_stride[2] +
i3 * output_stride[3] +
Group_Size * group) / 2;
copy_in(input_offset);
LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
LocalTensor<float> min_local = min_queue.AllocTensor<float>();
LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
LocalTensor<half> half_local = half_queue.AllocTensor<half>();
input_to_cast(cast_local, input_local);
ReduceMax(max_local, cast_local, work_local, Group_Size);
ReduceMin(min_local, cast_local, work_local, Group_Size);
const float max_value = max_local.GetValue(0);
const float min_value = min_local.GetValue(0);
float d = max_value;
if (min_value < 0 && (-1 * min_value) > max_value) {
d = min_value;
}
d = d / (-8);
if (d != 0) {
Muls(cast_local, cast_local, 1.0f / d, Group_Size);
}
// range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
float scalar = 8.5f;
Adds(cast_local, cast_local, scalar, Group_Size);
Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
scalar = 15.0f;
Mins(cast_local, cast_local, scalar, Group_Size);
scalar = -8.0f;
Adds(cast_local, cast_local, scalar, Group_Size);
// float->half->int4b
Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
output_queue.EnQue(output_local);
copy_out(output_offset);
input_queue.FreeTensor(input_local);
work_queue.FreeTensor(work_local);
max_queue.FreeTensor(max_local);
min_queue.FreeTensor(min_local);
int8_queue.FreeTensor(int8_local);
half_queue.FreeTensor(half_local);
cast_queue.FreeTensor(cast_local);
return (half)d;
}
__aicore__ inline void calculate() {
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
uint32_t scale_local_offset = 0;
uint32_t scale_global_offset = 0;
for (int64_t i = ir; i < ir + dr; i++) {
for (int64_t j = 0; j < group_size_in_row; j++) {
half scale = calculate_group(i, j);
scale_local.SetValue(scale_local_offset++, scale);
// Copy Group_Size/2 length data each time.
if (scale_local_offset == Group_Size / 2) {
scale_local_offset = 0;
// TODO: OPTIMIZE ME
pipe_barrier(PIPE_ALL);
DataCopy(scale_gm[scale_global_offset], scale_local,
Group_Size / 2);
pipe_barrier(PIPE_ALL);
scale_global_offset += Group_Size / 2;
}
}
}
if (scale_local_offset != 0) {
pipe_barrier(PIPE_ALL);
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
DataCopyPad(scale_gm[scale_global_offset], scale_local,
dataCopyParams);
pipe_barrier(PIPE_ALL);
}
scale_queue.FreeTensor(scale_local);
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t *scale_ne;
size_t scale_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
int64_t group_size_in_row;
int64_t ir;
int64_t dr;
TPipe pipe;
GlobalTensor<SRC_T> input_gm;
GlobalTensor<half> scale_gm;
GlobalTensor<int8_t> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
QUANTIZE_FLOAT_TO_Q4_0<half> op;
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
QUANTIZE_FLOAT_TO_Q4_0<float> op;
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
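As a plain-C reference for the per-group math implemented above (the same [-8,8] -> [0,15] -> [-8,7] mapping as the kernel comments; a clarifying sketch, not part of the kernel):

    #include <math.h>
    // quantize one Group_Size (32) chunk of floats to Q4_0 nibbles; returns the group scale d
    static float quantize_group_q4_0_ref(const float x[32], int q[32]) {
        float amax = x[0];                           // value with the largest magnitude, sign kept
        for (int i = 1; i < 32; i++) {
            if (fabsf(x[i]) > fabsf(amax)) amax = x[i];
        }
        const float d = amax / -8.0f;
        for (int i = 0; i < 32; i++) {
            const float v = (d != 0.0f) ? x[i] / d : 0.0f;   // roughly [-8, 8]
            int t = (int)floorf(v + 8.5f);                   // [0, 16]
            if (t > 15) t = 15;                              // [0, 15]
            q[i] = t - 8;                                    // [-8, 7], the stored int4 value
        }
        return d;
    }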

View file

@ -130,7 +130,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
}
return res;
#else
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
cudaError_t err;
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
{
err = cudaMallocManaged(ptr, size);
}
else
{
err = cudaMalloc(ptr, size);
}
return err;
#else
return cudaMalloc(ptr, size);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#endif
}
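A hedged usage note for the switch above: the environment variable is read inside ggml_cuda_device_malloc, so it only needs to be set before the first allocation (the program name below is a placeholder, not from this commit):

    // POSIX shell:  GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./your-ggml-program
    // or from code, before any CUDA allocation:
    #include <stdlib.h>
    setenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY", "1", /*overwrite=*/1);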
@ -1486,7 +1501,7 @@ static void ggml_cuda_op_mul_mat(
}
// If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
@ -1885,10 +1900,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
&& src1->ne[1] == 1;
&& src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
@ -2344,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
}
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
if (!ggml_backend_buffer_is_cuda(src->buffer)) {
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
return false;
}
if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
return false;
}
// device -> device
// device -> device copy
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
#endif
return false;
}
if (backend_src != backend_dst) {
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
@ -2379,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
#endif
}
// record event on src stream
// record event on src stream after the copy
if (!cuda_ctx_src->copy_event) {
ggml_cuda_set_device(cuda_ctx_src->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
@ -2391,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
} else {
// src and dst are on the same backend
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
}
return true;
}
@ -2728,11 +2744,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_MUL_MAT_ID:
{
struct ggml_tensor * a = op->src[0];
if (op->op == GGML_OP_MUL_MAT) {
struct ggml_tensor * b = op->src[1];
if (a->ne[3] != b->ne[3]) {
return false;
}
struct ggml_tensor * b = op->src[1];
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
return false;
}
if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
return false;
}
switch (a->type) {
case GGML_TYPE_F32:
@ -2863,7 +2880,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
return true;
case GGML_OP_FLASH_ATTN_EXT:
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
#else
if (op->src[0]->ne[0] == 128) {
return true;

View file

@ -27,255 +27,11 @@
#include <vector>
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#include "vendors/hip.h"
#elif defined(GGML_USE_MUSA)
#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_fp16.h>
// XXX: Keep the following order the same as hipBLAS
// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
// #define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_32F MUSA_R_32F
// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
// #define cublasComputeType_t mublasComputeType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess
// XXX: Other CUDA => MUSA mapping
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr
// XXX: USE_CUDA_GRAPH
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
// XXX: cuBLAS => muBLAS mapping
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
// XXX: Clang builtins mapping
#define __vsub4 __vsub4_musa
#define __vcmpeq4 __vcmpeq4_musa
#define __vcmpne4 __vcmpne4_musa
#include "vendors/musa.h"
#else
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIPBLAS)
#define STRINGIZE_IMPL(...) #__VA_ARGS__
@ -318,11 +74,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
static const char * cublas_get_error_str(const cublasStatus_t err) {
#ifndef GGML_USE_MUSA
return cublasGetStatusString(err);
#else
return mublasStatus_to_string(err);
#endif // GGML_USE_MUSA
}
#else
static const char * cublas_get_error_str(const cublasStatus_t err) {
@ -364,129 +116,7 @@ typedef half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif //GGML_CUDA_F16
#if defined(GGML_USE_MUSA)
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
return __vsubss4(a, b);
}
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
}
return c;
}
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}
#endif // defined(GGML_USE_MUSA)
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif
#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
return reinterpret_cast<const int &>(c);
#else
int8x4_t c;
int16_t tmp;
#pragma unroll
for (int i = 0; i < 4; i++) {
tmp = va[i] - vb[i];
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
c[i] = tmp;
}
return reinterpret_cast<int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
return __vsubss4(a, b);
}
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
}
return c;
}
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
// __shfl_xor() for half2 was added in ROCm 5.6
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
typedef union half2_b32 {
half2 val;
int b32;
} half2_b32_t;
half2_b32_t tmp;
tmp.val = var;
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
return tmp.val;
}
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
#endif // defined(GGML_USE_HIPBLAS)
#endif // GGML_CUDA_F16
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE

View file

@ -500,7 +500,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
}
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const dim3 block_nums(block_num_y, 1, 1);
@ -510,7 +510,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
}
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@ -519,7 +519,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
}
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@ -528,7 +528,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
}
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@ -537,7 +537,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
}
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@ -588,7 +588,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
}
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@ -672,3 +672,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
GGML_UNUSED(src1_ncols);
GGML_UNUSED(src1_padded_row_size);
}
bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
src0_type == GGML_TYPE_F16;
}

View file

@ -16,3 +16,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);
bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);

View file

@ -142,8 +142,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
}
}
static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
static const float eps = 1e-6f;
static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
if (group_size < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
@ -196,8 +195,12 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int num_groups = dst->op_params[0];
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
}
void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

14 ggml/src/ggml-cuda/vendors/cuda.h vendored Normal file
View file

@ -0,0 +1,14 @@
#pragma once
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020

177 ggml/src/ggml-cuda/vendors/hip.h vendored Normal file
View file

@ -0,0 +1,177 @@
#pragma once
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#define __CUDA_ARCH__ 1300
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif
#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
return reinterpret_cast<const int &>(c);
#else
int8x4_t c;
int16_t tmp;
#pragma unroll
for (int i = 0; i < 4; i++) {
tmp = va[i] - vb[i];
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
c[i] = tmp;
}
return reinterpret_cast<int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
return __vsubss4(a, b);
}
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
}
return c;
}
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
// __shfl_xor() for half2 was added in ROCm 5.6
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
typedef union half2_b32 {
half2 val;
int b32;
} half2_b32_t;
half2_b32_t tmp;
tmp.val = var;
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
return tmp.val;
}
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000

171 ggml/src/ggml-cuda/vendors/musa.h vendored Normal file
View file

@ -0,0 +1,171 @@
#pragma once
#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_fp16.h>
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_32F MUSA_R_32F
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cublasGetStatusString mublasStatus_to_string
#define cudaDataType_t musaDataType_t
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess
// Additional mappings for MUSA virtual memory pool
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr
// Additional mappings for MUSA graphs
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
// XXX: Clang builtins mapping
#define __vsub4 __vsub4_musa
#define __vcmpeq4 __vcmpeq4_musa
#define __vcmpne4 __vcmpne4_musa
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
return __vsubss4(a, b);
}
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
}
return c;
}
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}

View file

@ -80,8 +80,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
/**
* Converts float32 to brain16.
*
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
* Subnormals shall be flushed to zero, and NANs will be quiet.
* This is binary identical to the Google Brain float conversion.
* Floats round to nearest even, and NaNs stay quiet.
* Subnormals are not flushed to zero here, though the FPU may flush them when the value is later used.
* This code should vectorize nicely with modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
@ -95,10 +96,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
if (!(u.i & 0x7f800000)) { /* subnormal */
h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
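For reference, a minimal standalone sketch of the same round-to-nearest-even rule (the helper name and test value are illustrative, assuming IEEE-754 binary32 input):
#include <stdint.h>
#include <stdio.h>
#include <string.h>
// Round-to-nearest-even float32 -> bfloat16, mirroring the bias trick above.
static uint16_t fp32_to_bf16_rne(float f) {
    uint32_t u;
    memcpy(&u, &f, sizeof(u));                      // bit-level view of the float
    if ((u & 0x7fffffff) > 0x7f800000) {            // NaN: keep the sign, force quiet
        return (uint16_t) ((u >> 16) | 64);
    }
    const uint32_t bias = 0x7fff + ((u >> 16) & 1); // ties round to the even mantissa
    return (uint16_t) ((u + bias) >> 16);
}
int main(void) {
    printf("bf16(1.0f) = 0x%04x\n", fp32_to_bf16_rne(1.0f)); // prints 0x3f80
    return 0;
}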
@ -146,6 +143,7 @@ extern "C" {
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <sys/prctl.h>
#endif
// 16-bit float

View file

@ -210,7 +210,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_COUNT
};
struct ggml_metal_context {
struct ggml_backend_metal_context {
int n_cb;
id<MTLDevice> device;
@ -224,6 +224,10 @@ struct ggml_metal_context {
bool support_simdgroup_mm;
bool should_capture_next_compute;
// abort ggml_metal_graph_compute if callback returns true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// MSL code
@ -289,7 +293,7 @@ static void * ggml_metal_host_malloc(size_t n) {
return data;
}
static struct ggml_metal_context * ggml_metal_init(int n_cb) {
static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
@ -306,7 +310,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
// Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
ctx->device = device;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
ctx->queue = [ctx->device newCommandQueue];
@ -668,7 +672,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
return ctx;
}
static void ggml_metal_free(struct ggml_metal_context * ctx) {
static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@ -734,7 +738,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
return nil;
}
static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) {
for (size_t i = 0, n = 3; i < n; ++i) {
if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
return false;
@ -845,7 +849,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
}
static enum ggml_status ggml_metal_graph_compute(
struct ggml_metal_context * ctx,
struct ggml_backend_metal_context * ctx,
struct ggml_cgraph * gf) {
@autoreleasepool {
@ -878,8 +882,11 @@ static enum ggml_status ggml_metal_graph_compute(
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
command_buffer_builder[cb_idx] = command_buffer;
// enqueue the command buffers in order to specify their execution order
[command_buffer enqueue];
// always enqueue the first two command buffers
// enqueue all of the command buffers if we don't need to abort
if (cb_idx < 2 || ctx->abort_callback == NULL) {
[command_buffer enqueue];
}
}
const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@ -2229,10 +2236,8 @@ static enum ggml_status ggml_metal_graph_compute(
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ggml_is_contiguous(src0));
//float eps;
//memcpy(&eps, dst->op_params, sizeof(float));
const float eps = 1e-6f; // TODO: temporarily hardcoded
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
const int32_t n_groups = ((int32_t *) dst->op_params)[0];
@ -2829,7 +2834,9 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder endEncoding];
[command_buffer commit];
if (cb_idx < 2 || ctx->abort_callback == NULL) {
[command_buffer commit];
}
});
// Wait for completion and check status of each command buffer
@ -2849,6 +2856,23 @@ static enum ggml_status ggml_metal_graph_compute(
return GGML_STATUS_FAILED;
}
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
if (!next_buffer) {
continue;
}
bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
if (next_queued) {
continue;
}
if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
return GGML_STATUS_ABORTED;
}
[next_buffer commit];
}
if (should_capture) {
@ -3152,7 +3176,7 @@ GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
}
GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ggml_metal_free(ctx);
free(backend);
}
@ -3164,13 +3188,13 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
}
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
return ggml_metal_graph_compute(metal_ctx, cgraph);
}
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
return ggml_metal_supports_op(metal_ctx, op);
}
@ -3215,9 +3239,9 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
}
ggml_backend_t ggml_backend_metal_init(void) {
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
if (ctx == NULL) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return NULL;
}
@ -3239,15 +3263,24 @@ bool ggml_backend_is_metal(ggml_backend_t backend) {
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
GGML_ASSERT(ggml_backend_is_metal(backend));
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
}
void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
GGML_ASSERT(ggml_backend_is_metal(backend));
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = user_data;
}
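A hypothetical caller-side sketch of the new abort hook (the flag, callback name, and usage comments are illustrative):
#include <stdbool.h>
// Abort once an external flag is raised; matches the ggml_abort_callback signature.
static bool my_should_abort(void * user_data) {
    const volatile bool * stop = (const volatile bool *) user_data;
    return *stop;
}
// Usage (sketch):
//   static volatile bool stop = false;
//   ggml_backend_t backend = ggml_backend_metal_init();
//   ggml_backend_metal_set_abort_callback(backend, my_should_abort, (void *) &stop);
//   // graph compute then returns GGML_STATUS_ABORTED once stop is set to true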
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
GGML_ASSERT(ggml_backend_is_metal(backend));
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
}
@ -3255,7 +3288,7 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
GGML_ASSERT(ggml_backend_is_metal(backend));
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ctx->should_capture_next_compute = true;
}

View file

@ -4003,7 +4003,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
float sumf = 0;
#if defined(__ARM_FEATURE_SVE)
if (svcntb() == QK8_0) {
if (ggml_sve_cnt_b == QK8_0) {
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
@ -5488,7 +5488,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
float sumf = 0;
#if defined(__ARM_FEATURE_SVE)
if (svcntb() == QK8_0) {
if (ggml_sve_cnt_b == QK8_0) {
svfloat32_t sumv0 = svdup_n_f32(0.0f);
svfloat32_t sumv1 = svdup_n_f32(0.0f);
@ -7132,22 +7132,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
// compute mask for subtraction
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
m <<= 1;
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
m <<= 1;
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
m <<= 1;
vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
m <<= 1;
// load Q8 and take product with Q3
@ -8403,13 +8403,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
m <<= 1;
vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
m <<= 1;
vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);

View file

@ -142,6 +142,10 @@ void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
#if defined(__ARM_FEATURE_SVE)
extern int ggml_sve_cnt_b;
#endif
#ifdef __cplusplus
}
#endif

View file

@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
return nullptr;
}
if (inet_addr(host) == INADDR_NONE) {
fprintf(stderr, "Invalid host address: %s\n", host);
return nullptr;
}
struct sockaddr_in serv_addr;
serv_addr.sin_family = AF_INET;
serv_addr.sin_addr.s_addr = inet_addr(host);
@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
return nullptr;
}
// require that the tensor data does not go beyond the buffer end
uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
result->op = (ggml_op) tensor->op;
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
result->op_params[i] = tensor->op_params[i];
@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
uint64_t offset;
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead(),
@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
return false;
}
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
// sanitize tensor->data
{
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
}
}
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
ggml_backend_tensor_set(tensor, data, offset, size);
ggml_free(ctx);
@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint
return false;
}
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
// sanitize tensor->data
{
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
}
}
// output serialization format: | data (size bytes) |
output.resize(size, 0);
ggml_backend_tensor_get(tensor, output.data(), offset, size);
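The same range check, pulled out into a small standalone helper for clarity (names are illustrative; base and buf_size correspond to the buffer base pointer and size queried above):
#include <stdbool.h>
#include <stdint.h>
// Returns true if [data + offset, data + offset + size) lies inside [base, base + buf_size).
static bool range_in_buffer(uint64_t data, uint64_t offset, uint64_t size,
                            uint64_t base, uint64_t buf_size) {
    const uint64_t p0 = base;
    const uint64_t p1 = base + buf_size;
    if (data + offset < p0 || data + offset >= p1) {
        return false;                        // start of the region is outside the buffer
    }
    return size <= p1 - (data + offset);     // region must not run past the buffer end
}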

View file

@ -3981,6 +3981,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
ggml_sycl_func_t func;
switch (tensor->op) {
case GGML_OP_CONV_TRANSPOSE_1D:
func = ggml_sycl_op_conv_transpose_1d;
break;
case GGML_OP_REPEAT:
func = ggml_sycl_repeat;
break;
@ -4105,6 +4108,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
case GGML_OP_ARGSORT:
func = ggml_sycl_argsort;
break;
case GGML_OP_TIMESTEP_EMBEDDING:
func = ggml_sycl_op_timestep_embedding;
break;
default:
return false;
}
@ -5090,6 +5096,15 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_CONV_TRANSPOSE_1D:
{
ggml_type src0_type = op->src[0]->type;
ggml_type src1_type = op->src[1]->type;
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return true;
}
return false;
} break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU:
@ -5213,6 +5228,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_LEAKY_RELU:
case GGML_OP_TIMESTEP_EMBEDDING:
return true;
default:
return false;

View file

@ -15,6 +15,7 @@
#include "concat.hpp"
#include "common.hpp"
#include "conv.hpp"
#include "convert.hpp"
#include "dequantize.hpp"
#include "dmmv.hpp"
@ -23,5 +24,6 @@
#include "rope.hpp"
#include "norm.hpp"
#include "softmax.hpp"
#include "tsembd.hpp"
#endif // GGML_SYCL_BACKEND_HPP

View file

@ -0,0 +1,99 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#include "conv.hpp"
static void conv_transpose_1d_kernel(
const int s0, const int output_size,
const int src0_ne0, const int src0_ne1, const int src0_ne2,
const int src1_ne0, const int dst_ne0,
const float * src0, const float * src1, float * dst,
const sycl::nd_item<3> &item_ct1) {
int global_index = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
if (global_index >= output_size) {
return;
}
int out_index = global_index / dst_ne0;
float accumulator = 0;
for (int c = 0; c < src0_ne2; c++) {
int idx = global_index % dst_ne0;
int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
int input_offset = src1_ne0 * c;
for (int i = 0; i < src1_ne0; i++) {
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
continue;
}
int weight_idx = idx - i*s0;
float kernel_weight = src0[kernel_offset + weight_idx];
float input_value = src1[input_offset+i];
accumulator += kernel_weight * input_value;
}
}
dst[global_index] = accumulator;
}
static void conv_transpose_1d_f32_f32_sycl(
const int s0, const int output_size,
const int src0_ne0, const int src0_ne1, const int src0_ne2,
const int src1_ne0, const int dst_ne0,
const float *src0, const float *src1, float *dst,
const queue_ptr& stream) {
const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
const sycl::range<3> block_nums(1, 1, num_blocks);
stream->parallel_for(
sycl::nd_range<3>(
block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
conv_transpose_1d_kernel(
s0, output_size,
src0_ne0, src0_ne1, src0_ne2,
src1_ne0, dst_ne0,
src0, src1, dst, item_ct1);
});
}
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst) {
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;
dpct::queue_ptr stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
const int32_t * opts = (const int32_t *)dst->op_params;
const int s0 = opts[0];
const int64_t output_size = ggml_nelements(dst);
conv_transpose_1d_f32_f32_sycl(s0, output_size,
src0->ne[0], src0->ne[1], src0->ne[2],
src1->ne[0], dst->ne[0],
src0_d, src1_d, dst_d, stream);
}
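A host-side reference loop with the same index arithmetic as the kernel above (dimension roles assumed: src0 is [K, OC, IC], src1 is [L, IC], dst is [out_len, OC]; illustrative only):
// Plain-C reference for the transposed 1-D convolution computed by the kernel.
static void conv_transpose_1d_ref(const float * src0, const float * src1, float * dst,
                                  int K, int IC, int OC, int L, int out_len, int s0) {
    for (int oc = 0; oc < OC; ++oc) {
        for (int idx = 0; idx < out_len; ++idx) {
            float acc = 0.0f;
            for (int c = 0; c < IC; ++c) {
                for (int i = 0; i < L; ++i) {
                    if (idx < i*s0 || idx >= i*s0 + K) {
                        continue;            // this input sample does not contribute to idx
                    }
                    acc += src0[K*OC*c + oc*K + (idx - i*s0)] * src1[c*L + i];
                }
            }
            dst[oc*out_len + idx] = acc;
        }
    }
}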

View file

@ -0,0 +1,21 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#ifndef GGML_SYCL_CONV_HPP
#define GGML_SYCL_CONV_HPP
#include "common.hpp"
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst);
#endif // GGML_SYCL_CONV_HPP

View file

@ -874,7 +874,7 @@ namespace dpct
inline std::string get_preferred_gpu_platform_name() {
std::string result;
std::string filter = "level-zero";
std::string filter = "";
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
if (env) {
if (std::strstr(env, "level_zero")) {
@ -892,11 +892,24 @@ namespace dpct
else {
throw std::runtime_error("invalid device filter: " + std::string(env));
}
} else {
auto default_device = sycl::device(sycl::default_selector_v);
auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>();
if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) {
filter = "level-zero";
}
else if (std::strstr(default_platform_name.c_str(), "CUDA")) {
filter = "cuda";
}
else if (std::strstr(default_platform_name.c_str(), "HIP")) {
filter = "hip";
}
}
auto plaform_list = sycl::platform::get_platforms();
auto platform_list = sycl::platform::get_platforms();
for (const auto& platform : plaform_list) {
for (const auto& platform : platform_list) {
auto devices = platform.get_devices();
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
return d.is_gpu();

View file

@ -902,7 +902,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
vx, vy, dst, ncols, nrows, item_ct1);
});
});

View file

@ -225,9 +225,8 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
}
static void group_norm_f32_sycl(const float* x, float* dst,
const int num_groups, const int group_size,
const int num_groups, const float eps, const int group_size,
const int ne_elements, queue_ptr stream, int device) {
static const float eps = 1e-6f;
if (group_size < 1024) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
stream->submit([&](sycl::handler& cgh) {
@ -343,8 +342,12 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
GGML_ASSERT(dst->type == GGML_TYPE_F32);
int num_groups = dst->op_params[0];
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
(void)src1;
(void)dst;

View file

@ -41,6 +41,8 @@
#define SYCL_ACC_BLOCK_SIZE 256
#define SYCL_IM2COL_BLOCK_SIZE 256
#define SYCL_POOL2D_BLOCK_SIZE 256
#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
// dmmv = dequantize_mul_mat_vec
#ifndef GGML_SYCL_DMMV_X

View file

@ -0,0 +1,71 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#include "tsembd.hpp"
static void timestep_embedding_f32(
const float * timesteps, float * dst, const int nb1,
const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) {
// item_ct1.get_group(1) (blockIdx.y): index into timesteps->ne[0]
// item_ct1.get_group(2) (blockIdx.x): index into ((dim + 1) / 2) / BLOCK_SIZE
int i = item_ct1.get_group(1);
int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
float * embed_data = (float *)((char *)dst + i*nb1);
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
embed_data[dim] = 0.f;
}
int half = dim / 2;
if (j >= half) {
return;
}
float timestep = timesteps[i];
float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half);
float arg = timestep * freq;
embed_data[j] = sycl::cos(arg);
embed_data[j + half] = sycl::sin(arg);
}
static void timestep_embedding_f32_sycl(
const float * x, float * dst, const int ne00, const int nb1,
const int dim, const int max_period, const queue_ptr& stream) {
// The kernel returns early for thread indices >= dim/2, so half_ceil does not need to round up
int half_ceil = dim / 2;
int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
sycl::range<3> gridDim(1, ne00, num_blocks);
stream->parallel_for(
sycl::nd_range<3>(
gridDim * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
timestep_embedding_f32(
x, dst, nb1, dim, max_period, item_ct1
);
});
}
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor * dst) {
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
dpct::queue_ptr stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
}
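A host-side reference of the same sinusoidal embedding (plain C, illustrative only; it follows the freq/cos/sin layout and odd-dim zero pad used by the kernel):
#include <math.h>
// dst must hold dim floats, or dim + 1 when dim is odd (for the trailing zero pad).
static void timestep_embedding_ref(float timestep, float * dst, int dim, int max_period) {
    const int half = dim / 2;
    for (int j = 0; j < half; ++j) {
        const float freq = expf(-logf((float) max_period) * j / half);
        const float arg  = timestep * freq;
        dst[j]        = cosf(arg);   // first half: cosines
        dst[j + half] = sinf(arg);   // second half: sines
    }
    if (dim % 2 != 0) {
        dst[dim] = 0.0f;             // odd dim: zero the extra slot
    }
}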

View file

@ -0,0 +1,21 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#ifndef GGML_SYCL_TSEMBD_HPP
#define GGML_SYCL_TSEMBD_HPP
#include "common.hpp"
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor * dst);
#endif // GGML_SYCL_TSEMBD_HPP

File diff suppressed because it is too large

View file

@ -37,6 +37,9 @@
#include <unistd.h>
#endif
#if defined(__ARM_FEATURE_SVE)
int ggml_sve_cnt_b = 0;
#endif
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
#undef GGML_USE_LLAMAFILE
#endif
@ -53,6 +56,9 @@
// disable POSIX deprecation warnings
// these functions are never going away, anyway
#pragma warning(disable: 4996)
// unreachable code because of multiple instances of code after GGML_ABORT
#pragma warning(disable: 4702)
#endif
#if defined(_WIN32)
@ -141,7 +147,51 @@ typedef pthread_t ggml_thread_t;
#include <sys/wait.h>
#if defined(__linux__)
#if defined(__ANDROID__)
#include <unwind.h>
#include <dlfcn.h>
#include <stdio.h>
struct backtrace_state {
void ** current;
void ** end;
};
static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
struct backtrace_state * state = (struct backtrace_state *)arg;
uintptr_t pc = _Unwind_GetIP(context);
if (pc) {
if (state->current == state->end) {
return _URC_END_OF_STACK;
} else {
*state->current++ = (void*)pc;
}
}
return _URC_NO_REASON;
}
static void ggml_print_backtrace_symbols(void) {
const int max = 100;
void* buffer[max];
struct backtrace_state state = {buffer, buffer + max};
_Unwind_Backtrace(unwind_callback, &state);
int count = state.current - buffer;
for (int idx = 0; idx < count; ++idx) {
const void * addr = buffer[idx];
const char * symbol = "";
Dl_info info;
if (dladdr(addr, &info) && info.dli_sname) {
symbol = info.dli_sname;
}
fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
}
}
#elif defined(__linux__) && defined(__GLIBC__)
#include <execinfo.h>
static void ggml_print_backtrace_symbols(void) {
void * trace[100];
@ -436,9 +486,16 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
}
}
void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
for (int i = 0; i < n; i++) {
y[i] = ggml_compute_fp32_to_bf16(x[i]);
}
}
void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
int i = 0;
#if defined(__AVX512BF16__)
// subnormals are flushed to zero on this platform
for (; i + 32 <= n; i += 32) {
_mm512_storeu_si512(
(__m512i *)(y + i),
@ -918,7 +975,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
.vec_dot_type = GGML_TYPE_BF16,
.nrows = 1,
@ -2282,7 +2339,7 @@ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
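A quick check of why the ELU change above prefers expm1f over expf(x) - 1 for the negative branch (values are illustrative; for tiny |x| the subtraction cancels to zero in float):
#include <math.h>
#include <stdio.h>
int main(void) {
    const float x = -1e-8f;
    printf("expf(x) - 1 = %.9g\n", expf(x) - 1.0f); // cancels to 0
    printf("expm1f(x)   = %.9g\n", expm1f(x));      // ~ -1e-08, precision kept
    return 0;
}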
@ -3531,6 +3588,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
#if defined(__ARM_FEATURE_SVE)
if (!ggml_sve_cnt_b) {
ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
}
#endif
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
ggml_critical_section_end();
@ -3685,7 +3748,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_tensor * view_src,
size_t view_offs) {
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
// find the base tensor and absolute offset
if (view_src != NULL && view_src->view_src != NULL) {
@ -5338,6 +5402,7 @@ static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups,
float eps,
bool inplace) {
bool is_node = false;
@ -5348,7 +5413,8 @@ static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op_params[0] = n_groups;
ggml_set_op_params_i32(result, 0, n_groups);
ggml_set_op_params_f32(result, 1, eps);
result->op = GGML_OP_GROUP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -5360,15 +5426,17 @@ static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups) {
return ggml_group_norm_impl(ctx, a, n_groups, false);
int n_groups,
float eps) {
return ggml_group_norm_impl(ctx, a, n_groups, eps, false);
}
struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups) {
return ggml_group_norm_impl(ctx, a, n_groups, true);
int n_groups,
float eps) {
return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
}
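
With this change the epsilon is part of the public group-norm API rather than being hardcoded to 1e-6f in the forward pass (see the hunk below). A minimal, hypothetical call-site sketch — ctx, cur and the group count of 32 are assumptions, 1e-6f simply reproduces the previous hardcoded value:

// hypothetical call site; ctx and cur are an existing ggml context and activation tensor
struct ggml_tensor * out  = ggml_group_norm        (ctx, cur, 32, 1e-6f); // out-of-place
struct ggml_tensor * out2 = ggml_group_norm_inplace(ctx, cur, 32, 1e-6f); // in-place variant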
// ggml_mul_mat
@ -12068,10 +12136,11 @@ static void ggml_compute_forward_group_norm_f32(
GGML_TENSOR_UNARY_OP_LOCALS
const float eps = 1e-6f; // TODO: make this a parameter
// TODO: optimize
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
int n_channels = src0->ne[2];
int n_groups = dst->op_params[0];
int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@ -20645,7 +20714,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_BF16:
{
size_t elemsize = sizeof(ggml_bf16_t);
ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n);
ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
result = n * elemsize;
} break;
case GGML_TYPE_F32:

View file

@ -1,5 +1,7 @@
find_package (Threads REQUIRED)
set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)

View file

@ -4,9 +4,11 @@
#include "generic_binary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
}

View file

@ -4,10 +4,12 @@
#include "generic_unary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
}

View file

@ -0,0 +1,35 @@
#version 450
#include "types.comp"
#include "generic_binary_head.comp"
void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
const int dim = p.param3;
if (idx >= p.ne) {
return;
}
const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
const uint i2_offset = i2*p.ne21*p.ne20;
const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;
uint o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));
const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
#else
data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx];
#endif
}

View file

@ -4,13 +4,15 @@
#include "generic_unary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
#else
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
#endif
}

View file

@ -4,9 +4,11 @@
#include "generic_binary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
}

View file

@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
const uint i = gl_GlobalInvocationID.x;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;

View file

@ -0,0 +1,23 @@
#version 450
#include "generic_head.comp"
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const float GELU_QUICK_COEF = -1.702f;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
const float x = float(data_a[i]);
data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
}
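
Several of the element-wise shaders in this change (gelu_quick above, and the get_idx() helpers added further down) flatten the 3D invocation id as gz*262144 + gy*512 + gx, with local_size_x = 512 and 262144 = 512*512, so large tensors can spill into the y and z grid dimensions. A hedged C sketch of the host-side sizing this flattening implies — the actual ggml-vulkan dispatch code is not part of these hunks, so treat this as an assumption:

// Assumed global work size for ne elements, matching idx = gz*262144 + gy*512 + gx.
// The dispatcher rounds each dimension up to whole workgroups (512 in x, 1 in y/z);
// the `idx >= p.ne` guard in the shaders discards the overshoot.
static void pick_global_size(unsigned int ne, unsigned int out[3]) {
    if (ne > 262144u) {        // spill into z once x*y is exhausted
        out[0] = 512u; out[1] = 512u; out[2] = (ne + 262143u) / 262144u;
    } else if (ne > 512u) {    // spill into y once x is exhausted
        out[0] = 512u; out[1] = (ne + 511u) / 512u; out[2] = 1u;
    } else {
        out[0] = ne; out[1] = 1u; out[2] = 1u;
    }
}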

View file

@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
uint d_offset;
float param1; float param2;
float param1; float param2; int param3;
} p;
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
}
uint src0_idx(uint idx) {
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;

View file

@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
}
uint src0_idx(uint idx) {
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;

View file

@ -0,0 +1,66 @@
#version 450
#include "generic_head.comp"
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 512
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared float tmp[BLOCK_SIZE];
void main() {
const uint group_size = p.KX;
const float eps = p.param1;
const uint tid = gl_LocalInvocationID.x;
const uint start = gl_WorkGroupID.x * group_size + tid;
const uint end = start + group_size;
tmp[tid] = 0.0f;
// Calculate mean
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
tmp[tid] += float(data_a[col]);
}
// sum up partial sums and write back result
barrier();
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier();
}
const float mean = tmp[0] / group_size;
barrier();
tmp[tid] = 0.0f;
// Calculate variance
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
const float xi = float(data_a[col]) - mean;
data_d[col] = D_TYPE(xi);
tmp[tid] += xi * xi;
}
// sum up partial sums and write back result
barrier();
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier();
}
const float variance = tmp[0] / group_size;
const float scale = inversesqrt(variance + eps);
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
data_d[col] *= D_TYPE(scale);
}
}
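
For reference, this is what each workgroup of group_norm.comp computes, written out as a plain sequential C sketch (eps corresponds to p.param1; useful as a check against the shader, not a drop-in for it):

#include <math.h>

static void group_norm_ref(const float * x, float * y, int group_size, float eps) {
    float mean = 0.0f;
    for (int i = 0; i < group_size; i++) {
        mean += x[i];
    }
    mean /= group_size;

    float var = 0.0f;
    for (int i = 0; i < group_size; i++) {
        const float xi = x[i] - mean;
        y[i] = xi;          // store the centered value, as the shader does
        var += xi * xi;
    }
    var /= group_size;

    const float scale = 1.0f / sqrtf(var + eps);
    for (int i = 0; i < group_size; i++) {
        y[i] *= scale;
    }
}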

View file

@ -0,0 +1,57 @@
#version 450
#extension GL_EXT_shader_16bit_storage : require
layout (push_constant) uniform parameter
{
uint batch_offset; uint offset_delta;
uint IC;
uint IW; uint IH;
uint OW; uint OH;
uint KW; uint KH;
uint pelements;
uint CHW;
int s0; int s1;
int p0; int p1;
int d0; int d1;
} p;
#include "types.comp"
#define BLOCK_SIZE 256
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.x;
if (i >= p.pelements) {
return;
}
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
const uint kx = i / ksize;
const uint kd = kx * ksize;
const uint ky = (i - kd) / p.OW;
const uint ix = i % p.OW;
const uint oh = gl_GlobalInvocationID.y;
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
const uint offset_dst =
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
(ic * (p.KW * p.KH) + ky * p.KW + kx);
if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
data_d[offset_dst] = D_TYPE(0.0f);
} else {
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
}
}

View file

@ -0,0 +1,22 @@
#version 450
#include "generic_head.comp"
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
const float val = float(data_a[i]);
data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
}

View file

@ -4,9 +4,11 @@
#include "generic_binary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
}

View file

@ -16,6 +16,13 @@ void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
const uint tid = gl_LocalInvocationID.x;
// There are not enough cols to use all threads
if (tid >= p.ncols) {
return;
}
const uint block_size = min(p.ncols, BLOCK_SIZE);
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
@ -23,8 +30,8 @@ void main() {
tmp[tid] = FLOAT_TYPE(0.0f);
[[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) {
const uint col = i*BLOCK_SIZE + 2*tid;
[[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) {
const uint col = i*block_size + 2*tid;
const uint ib = (row*p.ncols + col)/QUANT_K; // block index
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
const uint iybs = col - col%QUANT_K; // y block start index
@ -38,7 +45,7 @@ void main() {
// sum up partial sums and write back result
barrier();
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
[[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}

View file

@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared vec2 sum[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x;
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
sum[tid] = vec2(0.0f, 0.0f);

View file

@ -0,0 +1,26 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (idx >= p.ne) {
return;
}
const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
const uint i2_offset = i2*p.ne11*p.ne10;
const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
}
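
A plain C reference for what pad.comp does: copy the source tensor into the low corner of a larger destination and zero the rest. Contiguous row-major storage is assumed here for brevity; the shader itself works on the strided layout from the push constants:

static void pad_ref(const float * src, float * dst,
                    int ne00, int ne01, int ne02, int ne03,   // src dims
                    int ne10, int ne11, int ne12, int ne13) { // dst dims (>= src dims)
    for (int i3 = 0; i3 < ne13; i3++)
    for (int i2 = 0; i2 < ne12; i2++)
    for (int i1 = 0; i1 < ne11; i1++)
    for (int i0 = 0; i0 < ne10; i0++) {
        const int dst_idx = ((i3*ne12 + i2)*ne11 + i1)*ne10 + i0;
        const int in_src  = i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03;
        dst[dst_idx] = in_src ? src[((i3*ne02 + i2)*ne01 + i1)*ne00 + i0] : 0.0f;
    }
}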

View file

@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.x;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;

View file

@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared FLOAT_TYPE sum[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x;
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp

View file

@ -4,9 +4,11 @@
#include "generic_unary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
}

View file

@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.x;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;

View file

@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
void main() {
const uint tid = gl_LocalInvocationID.x;
const uint rowx = gl_WorkGroupID.x;
const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint rowy = rowx % p.KY;
float slope = 1.0f;

View file

@ -4,10 +4,12 @@
#include "generic_unary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val);
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
}

View file

@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x;
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint col = gl_LocalInvocationID.x;
tmp[col] = FLOAT_TYPE(0.0f);

View file

@ -0,0 +1,21 @@
#version 450
#include "generic_head.comp"
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
data_d[i] = D_TYPE(tanh(data_a[i]));
}

View file

@ -0,0 +1,41 @@
#version 450
#extension GL_EXT_shader_16bit_storage : require
layout (push_constant) uniform parameter
{
uint nb1;
uint dim;
uint max_period;
} p;
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 256
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_WorkGroupID.y;
const uint j = gl_GlobalInvocationID.x;
const uint d_offset = i * p.nb1;
if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
data_d[d_offset + p.dim] = 0.f;
}
const uint half_dim = p.dim / 2;
if (j >= half_dim) {
return;
}
const float timestep = float(data_a[i]);
const float freq = float(exp(-log(p.max_period) * j / half_dim));
const float arg = timestep * freq;
data_d[d_offset + j] = D_TYPE(cos(arg));
data_d[d_offset + j + half_dim] = D_TYPE(sin(arg));
}
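
The same sinusoidal timestep embedding, written as a sequential C sketch that mirrors the shader's indexing (cosines in the first half of the row, sines in the second; max_period corresponds to p.max_period):

#include <math.h>

static void timestep_embedding_ref(float timestep, float * dst, int dim, int max_period) {
    const int half_dim = dim / 2;
    for (int j = 0; j < half_dim; j++) {
        const float freq = expf(-logf((float) max_period) * j / half_dim);
        const float arg  = timestep * freq;
        dst[j]            = cosf(arg);
        dst[j + half_dim] = sinf(arg);
    }
    if (dim % 2 != 0) {
        dst[dim] = 0.0f; // odd dim: the shader zero-pads one extra trailing element,
                         // so the row needs room for dim + 1 outputs
    }
}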

View file

@ -6,7 +6,7 @@
#define QUANT_K 1
#define QUANT_R 1
#ifndef LOAD_VEC_A
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
#define A_TYPE float
#elif LOAD_VEC_A == 4
#define A_TYPE vec4
@ -19,7 +19,7 @@
#define QUANT_K 1
#define QUANT_R 1
#ifndef LOAD_VEC_A
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
#define A_TYPE float16_t
#elif LOAD_VEC_A == 4
#define A_TYPE f16vec4

View file

@ -0,0 +1,36 @@
#version 450
layout (push_constant) uniform parameter
{
uint ne; uint d_offset;
uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13;
float sf0; float sf1; float sf2; float sf3;
} p;
#include "types.comp"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (idx >= p.ne) {
return;
}
const uint i10 = idx % p.ne10;
const uint i11 = (idx / p.ne10) % p.ne11;
const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
const uint i00 = uint(i10 / p.sf0);
const uint i01 = uint(i11 / p.sf1);
const uint i02 = uint(i12 / p.sf2);
const uint i03 = uint(i13 / p.sf3);
data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
}

View file

@ -22,6 +22,7 @@
#ifdef _WIN32
#include <windows.h>
#include <direct.h> // For _mkdir on Windows
#include <algorithm> // For std::replace on w64devkit
#else
#include <unistd.h>
#include <sys/wait.h>
@ -30,20 +31,6 @@
#define ASYNCIO_CONCURRENCY 64
// define prototypes
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str);
bool directory_exists(const std::string& path);
bool create_directory(const std::string& path);
std::string to_uppercase(const std::string& input);
bool string_ends_with(const std::string& str, const std::string& suffix);
std::string join_paths(const std::string& path1, const std::string& path2);
std::string basename(const std::string &path);
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16);
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b);
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id);
void process_shaders(std::vector<std::future<void>>& tasks);
void write_output_files();
std::mutex lock;
std::vector<std::pair<std::string, std::string>> shader_fnames;
@ -52,7 +39,7 @@ std::string input_dir = "vulkan-shaders";
std::string output_dir = "/tmp";
std::string target_hpp = "ggml-vulkan-shaders.hpp";
std::string target_cpp = "ggml-vulkan-shaders.cpp";
bool clean = true;
bool no_clean = false;
const std::vector<std::string> type_names = {
"f32",
@ -193,11 +180,7 @@ bool string_ends_with(const std::string& str, const std::string& suffix) {
return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}
#ifdef _WIN32
static const char path_separator = '\\';
#else
static const char path_separator = '/';
#endif
static const char path_separator = '/';
std::string join_paths(const std::string& path1, const std::string& path2) {
return path1 + path_separator + path2;
@ -212,7 +195,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
std::string out_fname = join_paths(output_dir, name + ".spv");
std::string in_path = join_paths(input_dir, in_fname);
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
#ifdef _WIN32
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
#else
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
#endif
for (const auto& define : defines) {
cmd.push_back("-D" + define.first + "=" + define.second);
}
@ -283,9 +270,12 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
for (const auto& tname : type_names) {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
// For unaligned, load one at a time for f32/f16, or two at a time for quants
std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
// For aligned matmul loads
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
@ -354,6 +344,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
@ -371,6 +364,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
@ -396,15 +392,42 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
@ -438,6 +461,17 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
}
void write_output_files() {
@ -449,10 +483,16 @@ void write_output_files() {
for (const auto& pair : shader_fnames) {
const std::string& name = pair.first;
const std::string& path = pair.second;
#ifdef _WIN32
std::string path = pair.second;
std::replace(path.begin(), path.end(), '/', '\\' );
#else
const std::string& path = pair.second;
#endif
FILE* spv = fopen(path.c_str(), "rb");
if (!spv) {
std::cerr << "Error opening SPIR-V file: " << path << "\n";
std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
continue;
}
@ -464,7 +504,7 @@ void write_output_files() {
size_t read_size = fread(data.data(), 1, size, spv);
fclose(spv);
if (read_size != size) {
std::cerr << "Error reading SPIR-V file: " << path << "\n";
std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
continue;
}
@ -478,9 +518,8 @@ void write_output_files() {
}
fprintf(src, "\n};\n\n");
if (clean) {
if (!no_clean) {
std::remove(path.c_str());
// fprintf(stderr, "Removed: %s\n", path.c_str());
}
}
@ -496,18 +535,6 @@ int main(int argc, char** argv) {
}
}
if (argc <= 1 || args.find("--help") != args.end()) {
std::cout << "Usage:\n"
"\tvulkan-shaders-gen [options]\n\n"
"Options:\n"
"\t--glslc <path> Path to glslc executable (default: /usr/bin/glslc)\n"
"\t--input-dir Directory containing shader sources (required)\n"
"\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n"
"\t--target-hpp <path> Path to generate a header file with shader declarations in C++ format\n"
"\t--target-cpp <path> Path to generate a source code file implementing the declared shaders (optional)\n"
"\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n";
return EXIT_SUCCESS;
}
if (args.find("--glslc") != args.end()) {
GLSLC = args["--glslc"]; // Path to glslc
}
@ -524,7 +551,7 @@ int main(int argc, char** argv) {
target_cpp = args["--target-cpp"]; // Path to generated cpp file
}
if (args.find("--no-clean") != args.end()) {
clean = false; // Keep temporary SPIR-V files in output-dir after build
no_clean = true; // Keep temporary SPIR-V files in output-dir after build
}
if (!directory_exists(input_dir)) {