Merge remote-tracking branch 'origin/master' into dev-refactoring

2024-07-31 20:25:28 +08:00 · 2024-07-31 20:25:28 +08:00 · 6cc7432b37
commit 6cc7432b37
parent 74eb05a13b 268c566006
18 changed files with 667 additions and 400 deletions
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -126,16 +126,9 @@ let
    ++ optionals useMetalKit [ MetalKit ];
  cudaBuildInputs = with cudaPackages; [
-    cuda_cccl.dev # <nv/target>
+    cuda_cudart
-
+    cuda_cccl # <nv/target>
-    # A temporary hack for reducing the closure size, remove once cudaPackages
+    libcublas
    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
    cuda_cudart.dev
    cuda_cudart.lib
    cuda_cudart.static
    libcublas.dev
    libcublas.lib
    libcublas.static
  ];
  rocmBuildInputs = with rocmPackages; [
--- a/flake.lock
+++ b/flake.lock
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1721379653,
+        "lastModified": 1722062969,
-        "narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
+        "narHash": "sha256-QOS0ykELUmPbrrUGmegAUlpmUFznDQeR4q7rFhl8eQg=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
+        "rev": "b73c2221a46c13557b1b3be9c2070cc42cf01eb3",
        "type": "github"
      },
      "original": {
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -208,6 +208,7 @@ set(GGML_PUBLIC_HEADERS
    include/ggml-alloc.h
    include/ggml-backend.h
    include/ggml-blas.h
    include/ggml-cann.h
    include/ggml-cuda.h
    include/ggml.h
    include/ggml-kompute.h
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -849,11 +849,6 @@ if (GGML_CANN)
                ${CANN_INSTALL_DIR}/acllib/include
            )
            # TODO: find libs
            link_directories(
                ${CANN_INSTALL_DIR}/lib64
            )
            add_subdirectory(ggml-cann/kernels)
            list(APPEND CANN_LIBRARIES
                ascendcl
@ -872,6 +867,7 @@ if (GGML_CANN)
            set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     ${CANN_LIBRARIES} )
            set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
            set(GGML_EXTRA_LIBDIRS  ${GGML_EXTRA_LIBDIRS}  ${CANN_INSTALL_DIR}/lib64)
            list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
        endif()
    else()
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@ -27,255 +27,11 @@
 #include <vector>
 #if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
+#include "vendors/hip.h"
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasDestroy hipblasDestroy
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
 #define cublasHandle_t hipblasHandle_t
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
 #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
 #define cudaEventSynchronize hipEventSynchronize
 #define cudaEvent_t hipEvent_t
 #define cudaEventDestroy hipEventDestroy
 #define cudaFree hipFree
 #define cudaFreeHost hipHostFree
 #define cudaGetDevice hipGetDevice
 #define cudaGetDeviceCount hipGetDeviceCount
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
 #define cudaHostRegister hipHostRegister
 #define cudaHostRegisterPortable hipHostRegisterPortable
 #define cudaHostRegisterReadOnly hipHostRegisterReadOnly
 #define cudaHostUnregister hipHostUnregister
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
 #define cudaMemsetAsync hipMemsetAsync
 #define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 #define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
 #define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
 #define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
 #define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
 #define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
 #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 #elif defined(GGML_USE_MUSA)
-#include <musa_runtime.h>
+#include "vendors/musa.h"
 #include <musa.h>
 #include <mublas.h>
 #include <musa_fp16.h>
 // XXX: Keep the following order the same as hipBLAS
 // #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
 // #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
 #define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
 // #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F  MUSA_R_16F
 #define CUDA_R_32F  MUSA_R_32F
 // #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 // #define cublasComputeType_t mublasComputeType_t
 #define cublasCreate mublasCreate
 #define cublasDestroy mublasDestroy
 #define cublasGemmEx mublasGemmEx
 #define cublasGemmBatchedEx mublasGemmBatchedEx
 #define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
 #define cublasHandle_t mublasHandle_t
 // #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetMathMode mublasSetMathMode
 #define cublasSetStream mublasSetStream
 #define cublasSgemm mublasSgemm
 #define cublasStatus_t mublasStatus_t
 #define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
 #define cudaDeviceProp musaDeviceProp
 #define cudaDeviceSynchronize musaDeviceSynchronize
 #define cudaError_t musaError_t
 #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags musaEventCreateWithFlags
 #define cudaEventDisableTiming musaEventDisableTiming
 #define cudaEventRecord musaEventRecord
 #define cudaEventSynchronize musaEventSynchronize
 #define cudaEvent_t musaEvent_t
 #define cudaEventDestroy musaEventDestroy
 #define cudaFree musaFree
 #define cudaFreeHost musaFreeHost
 #define cudaGetDevice musaGetDevice
 #define cudaGetDeviceCount musaGetDeviceCount
 #define cudaGetDeviceProperties musaGetDeviceProperties
 #define cudaGetErrorString musaGetErrorString
 #define cudaGetLastError musaGetLastError
 #define cudaHostRegister musaHostRegister
 #define cudaHostRegisterPortable musaHostRegisterPortable
 #define cudaHostRegisterReadOnly musaHostRegisterReadOnly
 #define cudaHostUnregister musaHostUnregister
 #define cudaLaunchHostFunc musaLaunchHostFunc
 #define cudaMalloc musaMalloc
 #define cudaMallocHost musaMallocHost
 #define cudaMemcpy musaMemcpy
 #define cudaMemcpyAsync musaMemcpyAsync
 #define cudaMemcpyPeerAsync musaMemcpyPeerAsync
 #define cudaMemcpy2DAsync musaMemcpy2DAsync
 #define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
 #define cudaMemcpyHostToDevice musaMemcpyHostToDevice
 #define cudaMemcpyKind musaMemcpyKind
 #define cudaMemset musaMemset
 #define cudaMemsetAsync musaMemsetAsync
 #define cudaMemGetInfo musaMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
 #define cudaSetDevice musaSetDevice
 #define cudaStreamCreateWithFlags musaStreamCreateWithFlags
 #define cudaStreamDestroy musaStreamDestroy
 #define cudaStreamFireAndForget musaStreamFireAndForget
 #define cudaStreamNonBlocking musaStreamNonBlocking
 #define cudaStreamPerThread musaStreamPerThread
 #define cudaStreamSynchronize musaStreamSynchronize
 #define cudaStreamWaitEvent musaStreamWaitEvent
 #define cudaStream_t musaStream_t
 #define cudaSuccess musaSuccess
 // XXX: Other CUDA => MUSA mapping
 #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
 #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
 #define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
 #define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
 #define CUdevice MUdevice
 #define CUdeviceptr MUdeviceptr
 #define CUmemAccessDesc MUmemAccessDesc
 #define CUmemAllocationProp MUmemAllocationProp
 #define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
 #define cuDeviceGet muDeviceGet
 #define cuDeviceGetAttribute muDeviceGetAttribute
 #define cuMemAddressFree muMemAddressFree
 #define cuMemAddressReserve muMemAddressReserve
 #define cuMemCreate muMemCreate
 #define cuMemGetAllocationGranularity muMemGetAllocationGranularity
 #define cuMemMap muMemMap
 #define cuMemRelease muMemRelease
 #define cuMemSetAccess muMemSetAccess
 #define cuMemUnmap muMemUnmap
 #define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
 #define cudaFuncSetAttribute musaFuncSetAttribute
 #define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
 #define make_cudaExtent make_musaExtent
 #define make_cudaPitchedPtr make_musaPitchedPtr
 // XXX: USE_CUDA_GRAPH
 #define CUDA_SUCCESS MUSA_SUCCESS
 #define CUresult MUresult
 #define cuGetErrorString muGetErrorString
 #define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
 #define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
 #define cudaGraphDestroy musaGraphDestroy
 #define cudaGraphExecDestroy musaGraphExecDestroy
 #define cudaGraphExec_t musaGraphExec_t
 #define cudaGraphExecUpdate musaGraphExecUpdate
 #define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
 #define cudaGraphGetNodes musaGraphGetNodes
 #define cudaGraphInstantiate musaGraphInstantiate
 #define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
 #define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
 #define cudaGraphLaunch musaGraphLaunch
 #define cudaGraphNodeGetType musaGraphNodeGetType
 #define cudaGraphNode_t musaGraphNode_t
 #define cudaGraphNodeType musaGraphNodeType
 #define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
 #define cudaGraph_t musaGraph_t
 #define cudaKernelNodeParams musaKernelNodeParams
 #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
 #define cudaStreamEndCapture musaStreamEndCapture
 // XXX: cuBLAS => muBLAS mapping
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
 #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
 #define CUBLAS_COMPUTE_16F CUDA_R_16F
 #define CUBLAS_COMPUTE_32F CUDA_R_32F
 #define cublasComputeType_t cudaDataType_t
 // XXX: Clang builtins mapping
 #define __vsub4   __vsub4_musa
 #define __vcmpeq4 __vcmpeq4_musa
 #define __vcmpne4 __vcmpne4_musa
 #else
-#include <cuda_runtime.h>
+#include "vendors/cuda.h"
 #include <cuda.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
 #if CUDART_VERSION < 11020
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
 #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
 #define CUBLAS_COMPUTE_16F CUDA_R_16F
 #define CUBLAS_COMPUTE_32F CUDA_R_32F
 #define cublasComputeType_t cudaDataType_t
 #endif // CUDART_VERSION < 11020
 #endif // defined(GGML_USE_HIPBLAS)
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
@ -318,11 +74,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
 #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
    static const char * cublas_get_error_str(const cublasStatus_t err) {
 #ifndef GGML_USE_MUSA
        return cublasGetStatusString(err);
 #else
        return mublasStatus_to_string(err);
 #endif // GGML_USE_MUSA
    }
 #else
    static const char * cublas_get_error_str(const cublasStatus_t err) {
@ -364,129 +116,7 @@ typedef half2 dfloat2;
 #else
 typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
-#endif //GGML_CUDA_F16
+#endif // GGML_CUDA_F16
 #if defined(GGML_USE_MUSA)
 #ifndef __has_builtin
    #define __has_builtin(x) 0
 #endif
 typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
    return __vsubss4(a, b);
 }
 static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 #pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
 }
 static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 #pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
    }
    return c;
 }
 #endif // defined(GGML_USE_MUSA)
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3
 #endif
 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
 #define RDNA2
 #endif
 #if defined(__gfx1010__) || defined(__gfx1012__)
 #define RDNA1
 #endif
 #ifndef __has_builtin
    #define __has_builtin(x) 0
 #endif
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
 #if __has_builtin(__builtin_elementwise_sub_sat)
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    return reinterpret_cast<const int &>(c);
 #else
    int8x4_t c;
    int16_t tmp;
 #pragma unroll
    for (int i = 0; i < 4; i++) {
        tmp = va[i] - vb[i];
        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
        c[i] = tmp;
    }
    return reinterpret_cast<int &>(c);
 #endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 static __device__ __forceinline__ int __vsub4(const int a, const int b) {
    return __vsubss4(a, b);
 }
 static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 #pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
 }
 static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 #pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
    }
    return c;
 }
 #if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
 // __shfl_xor() for half2 was added in ROCm 5.6
 static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
    typedef union half2_b32 {
        half2 val;
        int   b32;
    } half2_b32_t;
    half2_b32_t tmp;
    tmp.val = var;
    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
    return tmp.val;
 }
 #endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
 #endif // defined(GGML_USE_HIPBLAS)
 #if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 #define FP16_AVAILABLE
--- a/ggml/src/ggml-cuda/vendors/cuda.h
+++ b/ggml/src/ggml-cuda/vendors/cuda.h
@ -0,0 +1,14 @@
 #pragma once
 #include <cuda_runtime.h>
 #include <cuda.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
 #if CUDART_VERSION < 11020
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
 #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
 #define CUBLAS_COMPUTE_16F CUDA_R_16F
 #define CUBLAS_COMPUTE_32F CUDA_R_32F
 #define cublasComputeType_t cudaDataType_t
 #endif // CUDART_VERSION < 11020
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@ -0,0 +1,177 @@
 #pragma once
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasDestroy hipblasDestroy
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
 #define cublasHandle_t hipblasHandle_t
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
 #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
 #define cudaEventSynchronize hipEventSynchronize
 #define cudaEvent_t hipEvent_t
 #define cudaEventDestroy hipEventDestroy
 #define cudaFree hipFree
 #define cudaFreeHost hipHostFree
 #define cudaGetDevice hipGetDevice
 #define cudaGetDeviceCount hipGetDeviceCount
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
 #define cudaHostRegister hipHostRegister
 #define cudaHostRegisterPortable hipHostRegisterPortable
 #define cudaHostRegisterReadOnly hipHostRegisterReadOnly
 #define cudaHostUnregister hipHostUnregister
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
 #define cudaMemsetAsync hipMemsetAsync
 #define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 #define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
 #define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
 #define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
 #define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
 #define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
 #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 #define __CUDA_ARCH__ 1300
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3
 #endif
 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
 #define RDNA2
 #endif
 #if defined(__gfx1010__) || defined(__gfx1012__)
 #define RDNA1
 #endif
 #ifndef __has_builtin
    #define __has_builtin(x) 0
 #endif
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
 #if __has_builtin(__builtin_elementwise_sub_sat)
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    return reinterpret_cast<const int &>(c);
 #else
    int8x4_t c;
    int16_t tmp;
 #pragma unroll
    for (int i = 0; i < 4; i++) {
        tmp = va[i] - vb[i];
        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
        c[i] = tmp;
    }
    return reinterpret_cast<int &>(c);
 #endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 static __device__ __forceinline__ int __vsub4(const int a, const int b) {
    return __vsubss4(a, b);
 }
 static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 #pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
 }
 static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 #pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
    }
    return c;
 }
 #if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
 // __shfl_xor() for half2 was added in ROCm 5.6
 static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
    typedef union half2_b32 {
        half2 val;
        int   b32;
    } half2_b32_t;
    half2_b32_t tmp;
    tmp.val = var;
    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
    return tmp.val;
 }
 #endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@ -0,0 +1,171 @@
 #pragma once
 #include <musa_runtime.h>
 #include <musa.h>
 #include <mublas.h>
 #include <musa_fp16.h>
 #define CUBLAS_COMPUTE_16F CUDA_R_16F
 #define CUBLAS_COMPUTE_32F CUDA_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
 #define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
 #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
 #define CUDA_R_16F  MUSA_R_16F
 #define CUDA_R_32F  MUSA_R_32F
 #define cublasComputeType_t cudaDataType_t
 #define cublasCreate mublasCreate
 #define cublasDestroy mublasDestroy
 #define cublasGemmEx mublasGemmEx
 #define cublasGemmBatchedEx mublasGemmBatchedEx
 #define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
 #define cublasHandle_t mublasHandle_t
 #define cublasSetMathMode mublasSetMathMode
 #define cublasSetStream mublasSetStream
 #define cublasSgemm mublasSgemm
 #define cublasStatus_t mublasStatus_t
 #define cublasGetStatusString mublasStatus_to_string
 #define cudaDataType_t musaDataType_t
 #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
 #define cudaDeviceProp musaDeviceProp
 #define cudaDeviceSynchronize musaDeviceSynchronize
 #define cudaError_t musaError_t
 #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags musaEventCreateWithFlags
 #define cudaEventDisableTiming musaEventDisableTiming
 #define cudaEventRecord musaEventRecord
 #define cudaEventSynchronize musaEventSynchronize
 #define cudaEvent_t musaEvent_t
 #define cudaEventDestroy musaEventDestroy
 #define cudaFree musaFree
 #define cudaFreeHost musaFreeHost
 #define cudaGetDevice musaGetDevice
 #define cudaGetDeviceCount musaGetDeviceCount
 #define cudaGetDeviceProperties musaGetDeviceProperties
 #define cudaGetErrorString musaGetErrorString
 #define cudaGetLastError musaGetLastError
 #define cudaHostRegister musaHostRegister
 #define cudaHostRegisterPortable musaHostRegisterPortable
 #define cudaHostRegisterReadOnly musaHostRegisterReadOnly
 #define cudaHostUnregister musaHostUnregister
 #define cudaLaunchHostFunc musaLaunchHostFunc
 #define cudaMalloc musaMalloc
 #define cudaMallocHost musaMallocHost
 #define cudaMemcpy musaMemcpy
 #define cudaMemcpyAsync musaMemcpyAsync
 #define cudaMemcpyPeerAsync musaMemcpyPeerAsync
 #define cudaMemcpy2DAsync musaMemcpy2DAsync
 #define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
 #define cudaMemcpyHostToDevice musaMemcpyHostToDevice
 #define cudaMemcpyKind musaMemcpyKind
 #define cudaMemset musaMemset
 #define cudaMemsetAsync musaMemsetAsync
 #define cudaMemGetInfo musaMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
 #define cudaSetDevice musaSetDevice
 #define cudaStreamCreateWithFlags musaStreamCreateWithFlags
 #define cudaStreamDestroy musaStreamDestroy
 #define cudaStreamFireAndForget musaStreamFireAndForget
 #define cudaStreamNonBlocking musaStreamNonBlocking
 #define cudaStreamPerThread musaStreamPerThread
 #define cudaStreamSynchronize musaStreamSynchronize
 #define cudaStreamWaitEvent musaStreamWaitEvent
 #define cudaStream_t musaStream_t
 #define cudaSuccess musaSuccess
 // Additional mappings for MUSA virtual memory pool
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
 #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
 #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
 #define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
 #define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
 #define CUdevice MUdevice
 #define CUdeviceptr MUdeviceptr
 #define CUmemAccessDesc MUmemAccessDesc
 #define CUmemAllocationProp MUmemAllocationProp
 #define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
 #define cuDeviceGet muDeviceGet
 #define cuDeviceGetAttribute muDeviceGetAttribute
 #define cuMemAddressFree muMemAddressFree
 #define cuMemAddressReserve muMemAddressReserve
 #define cuMemCreate muMemCreate
 #define cuMemGetAllocationGranularity muMemGetAllocationGranularity
 #define cuMemMap muMemMap
 #define cuMemRelease muMemRelease
 #define cuMemSetAccess muMemSetAccess
 #define cuMemUnmap muMemUnmap
 #define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
 #define cudaFuncSetAttribute musaFuncSetAttribute
 #define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
 #define make_cudaExtent make_musaExtent
 #define make_cudaPitchedPtr make_musaPitchedPtr
 // Additional mappings for MUSA graphs
 #define CUDA_SUCCESS MUSA_SUCCESS
 #define CUresult MUresult
 #define cuGetErrorString muGetErrorString
 #define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
 #define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
 #define cudaGraphDestroy musaGraphDestroy
 #define cudaGraphExecDestroy musaGraphExecDestroy
 #define cudaGraphExec_t musaGraphExec_t
 #define cudaGraphExecUpdate musaGraphExecUpdate
 #define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
 #define cudaGraphGetNodes musaGraphGetNodes
 #define cudaGraphInstantiate musaGraphInstantiate
 #define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
 #define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
 #define cudaGraphLaunch musaGraphLaunch
 #define cudaGraphNodeGetType musaGraphNodeGetType
 #define cudaGraphNode_t musaGraphNode_t
 #define cudaGraphNodeType musaGraphNodeType
 #define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
 #define cudaGraph_t musaGraph_t
 #define cudaKernelNodeParams musaKernelNodeParams
 #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
 #define cudaStreamEndCapture musaStreamEndCapture
 // XXX: Clang builtins mapping
 #define __vsub4   __vsub4_musa
 #define __vcmpeq4 __vcmpeq4_musa
 #define __vcmpne4 __vcmpne4_musa
 #ifndef __has_builtin
    #define __has_builtin(x) 0
 #endif
 typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
    return __vsubss4(a, b);
 }
 static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 #pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
 }
 static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
 #pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
    }
    return c;
 }
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@ -6449,22 +6449,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
            // compute mask for subtraction
            vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
            m <<= 1;
            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
            m <<= 1;
            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
            m <<= 1;
            vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
            m <<= 1;
            // load Q8 and take product with Q3
@ -7720,13 +7720,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
            vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
            m <<= 1;
            vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
            m <<= 1;
            vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@ -3981,6 +3981,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
    ggml_sycl_func_t func;
    switch (tensor->op) {
        case GGML_OP_CONV_TRANSPOSE_1D:
            func = ggml_sycl_op_conv_transpose_1d;
            break;
        case GGML_OP_REPEAT:
            func = ggml_sycl_repeat;
            break;
@ -4105,6 +4108,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
        case GGML_OP_ARGSORT:
            func = ggml_sycl_argsort;
            break;
        case GGML_OP_TIMESTEP_EMBEDDING:
            func = ggml_sycl_op_timestep_embedding;
            break;
        default:
            return false;
    }
@ -5090,6 +5096,15 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
 GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_CONV_TRANSPOSE_1D:
            {
                ggml_type src0_type = op->src[0]->type;
                ggml_type src1_type = op->src[1]->type;
                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                    return true;
                }
                return false;
            } break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_GELU:
@ -5213,6 +5228,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_LEAKY_RELU:
        case GGML_OP_TIMESTEP_EMBEDDING:
            return true;
        default:
            return false;
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@ -15,6 +15,7 @@
 #include "concat.hpp"
 #include "common.hpp"
 #include "conv.hpp"
 #include "convert.hpp"
 #include "dequantize.hpp"
 #include "dmmv.hpp"
@ -23,5 +24,6 @@
 #include "rope.hpp"
 #include "norm.hpp"
 #include "softmax.hpp"
 #include "tsembd.hpp"
 #endif // GGML_SYCL_BACKEND_HPP
--- a/ggml/src/ggml-sycl/conv.cpp
+++ b/ggml/src/ggml-sycl/conv.cpp
@ -0,0 +1,99 @@
 //
 // MIT license
 // Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: MIT
 //
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 #include "conv.hpp"
 static  void conv_transpose_1d_kernel(
        const int s0, const int output_size,
        const int src0_ne0, const int src0_ne1, const int src0_ne2,
        const int src1_ne0, const int dst_ne0,
        const float * src0, const float * src1,  float * dst,
        const sycl::nd_item<3> &item_ct1) {
    int global_index = item_ct1.get_local_id(2) +
                       item_ct1.get_group(2) * item_ct1.get_local_range(2);
    if (global_index >= output_size) {
        return;
    }
    int out_index = global_index / dst_ne0;
    float accumulator = 0;
    for (int c = 0; c < src0_ne2; c++) {
        int idx = global_index % dst_ne0;
        int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
        int input_offset = src1_ne0 * c;
        for (int i = 0; i < src1_ne0; i++) {
            if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
                continue;
            }
            int weight_idx = idx - i*s0;
            float kernel_weight = src0[kernel_offset + weight_idx];
            float input_value =  src1[input_offset+i];
            accumulator += kernel_weight * input_value;
        }
    }
    dst[global_index] = accumulator;
 }
 static void conv_transpose_1d_f32_f32_sycl(
    const int s0, const int output_size,
    const int src0_ne0, const int src0_ne1, const int src0_ne2,
    const int src1_ne0, const int dst_ne0,
    const float *src0, const float *src1, float *dst,
    const queue_ptr& stream) {
    const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
    const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
    const sycl::range<3> block_nums(1, 1, num_blocks);
    stream->parallel_for(
        sycl::nd_range<3>(
            block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) {
            conv_transpose_1d_kernel(
                s0, output_size,
                src0_ne0, src0_ne1, src0_ne2,
                src1_ne0, dst_ne0,
                src0, src1, dst, item_ct1);
        });
 }
 void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
    const ggml_tensor *src1, ggml_tensor *dst) {
    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;
    float * dst_d = (float *)dst->data;
    dpct::queue_ptr stream = ctx.stream();
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src1));
    const int32_t * opts = (const int32_t *)dst->op_params;
    const int s0 = opts[0];
    const int64_t output_size = ggml_nelements(dst);
    conv_transpose_1d_f32_f32_sycl(s0, output_size,
        src0->ne[0], src0->ne[1], src0->ne[2],
        src1->ne[0], dst->ne[0],
        src0_d, src1_d, dst_d, stream);
 }
--- a/ggml/src/ggml-sycl/conv.hpp
+++ b/ggml/src/ggml-sycl/conv.hpp
@ -0,0 +1,21 @@
 //
 // MIT license
 // Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: MIT
 //
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 #ifndef GGML_SYCL_CONV_HPP
 #define GGML_SYCL_CONV_HPP
 #include "common.hpp"
 void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
  const ggml_tensor *src1, ggml_tensor *dst);
 #endif // GGML_SYCL_CONV_HPP
--- a/ggml/src/ggml-sycl/presets.hpp
+++ b/ggml/src/ggml-sycl/presets.hpp
@ -41,6 +41,8 @@
 #define SYCL_ACC_BLOCK_SIZE 256
 #define SYCL_IM2COL_BLOCK_SIZE 256
 #define SYCL_POOL2D_BLOCK_SIZE 256
 #define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
 #define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_SYCL_DMMV_X
--- a/ggml/src/ggml-sycl/tsembd.cpp
+++ b/ggml/src/ggml-sycl/tsembd.cpp
@ -0,0 +1,71 @@
 //
 // MIT license
 // Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: MIT
 //
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 #include "tsembd.hpp"
 static void timestep_embedding_f32(
        const float * timesteps, float * dst, const int nb1,
        const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) {
    // item_ct1.get_group(1)(blockIDx.y): idx of timesteps->ne[0]
    // item_ct1.get_group(2) (blockIDx.x): idx of ((dim + 1) / 2) / BLOCK_SIZE
    int i = item_ct1.get_group(1);
    int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
    float * embed_data = (float *)((char *)dst +  i*nb1);
    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
        embed_data[dim] = 0.f;
    }
    int half = dim / 2;
    if (j >= half) {
        return;
    }
    float timestep = timesteps[i];
    float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half);
    float arg = timestep * freq;
    embed_data[j] = sycl::cos(arg);
    embed_data[j + half] = sycl::sin(arg);
 }
 static void timestep_embedding_f32_sycl(
        const float * x, float * dst, const int ne00, const int nb1,
        const int dim, const int max_period, const queue_ptr& stream) {
    // As the kernel returns when thread.idx is larger than dim/2, the half_ceil does not need to pad
    int half_ceil = dim / 2;
    int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
    sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
    sycl::range<3> gridDim(1, ne00, num_blocks);
    stream->parallel_for(
        sycl::nd_range<3>(
            gridDim * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) {
            timestep_embedding_f32(
                x, dst, nb1, dim, max_period, item_ct1
            );
        });
 }
 void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
    const ggml_tensor *src1, ggml_tensor * dst) {
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    dpct::queue_ptr stream = ctx.stream();
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    const int dim = dst->op_params[0];
    const int max_period = dst->op_params[1];
    timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
 }
--- a/ggml/src/ggml-sycl/tsembd.hpp
+++ b/ggml/src/ggml-sycl/tsembd.hpp
@ -0,0 +1,21 @@
 //
 // MIT license
 // Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: MIT
 //
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 #ifndef GGML_SYCL_TSEMBD_HPP
 #define GGML_SYCL_TSEMBD_HPP
 #include "common.hpp"
 void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
    const ggml_tensor *src1, ggml_tensor * dst);
 #endif // GGML_SYCL_TSEMBD_HPP
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -141,7 +141,51 @@ typedef pthread_t ggml_thread_t;
 #include <sys/wait.h>
-#if defined(__linux__)
+#if defined(__ANDROID__)
 #include <unwind.h>
 #include <dlfcn.h>
 #include <stdio.h>
 struct backtrace_state {
    void ** current;
    void ** end;
 };
 static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
    struct backtrace_state * state = (struct backtrace_state *)arg;
    uintptr_t pc = _Unwind_GetIP(context);
    if (pc) {
        if (state->current == state->end) {
            return _URC_END_OF_STACK;
        } else {
            *state->current++ = (void*)pc;
        }
    }
    return _URC_NO_REASON;
 }
 static void ggml_print_backtrace_symbols(void) {
    const int max = 100;
    void* buffer[max];
    struct backtrace_state state = {buffer, buffer + max};
    _Unwind_Backtrace(unwind_callback, &state);
    int count = state.current - buffer;
    for (int idx = 0; idx < count; ++idx) {
        const void * addr = buffer[idx];
        const char * symbol = "";
        Dl_info info;
        if (dladdr(addr, &info) && info.dli_sname) {
            symbol = info.dli_sname;
        }
        fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
    }
 }
 #elif defined(__linux__)
 #include <execinfo.h>
 static void ggml_print_backtrace_symbols(void) {
    void * trace[100];
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -312,6 +312,8 @@ class GGUFWriter:
        self.add_key_value(key, val, GGUFValueType.STRING)
    def add_array(self, key: str, val: Sequence[Any]) -> None:
        if len(val) == 0:
            return
        self.add_key_value(key, val, GGUFValueType.ARRAY)
    @staticmethod
@ -845,7 +847,14 @@ class GGUFWriter:
            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
            kv_data += self._pack("Q", len(encoded_val))
            kv_data += encoded_val
-        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
+        elif vtype == GGUFValueType.ARRAY:
            if not isinstance(val, Sequence):
                raise ValueError("Invalid GGUF metadata array, expecting sequence")
            if len(val) == 0:
                raise ValueError("Invalid GGUF metadata array. Empty array")
            if isinstance(val, bytes):
                ltype = GGUFValueType.UINT8
            else: