Merge remote-tracking branch 'origin/master' into dev-refactoring
This commit is contained in:
commit
6cc7432b37
18 changed files with 667 additions and 400 deletions
|
@ -126,16 +126,9 @@ let
|
||||||
++ optionals useMetalKit [ MetalKit ];
|
++ optionals useMetalKit [ MetalKit ];
|
||||||
|
|
||||||
cudaBuildInputs = with cudaPackages; [
|
cudaBuildInputs = with cudaPackages; [
|
||||||
cuda_cccl.dev # <nv/target>
|
cuda_cudart
|
||||||
|
cuda_cccl # <nv/target>
|
||||||
# A temporary hack for reducing the closure size, remove once cudaPackages
|
libcublas
|
||||||
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
|
|
||||||
cuda_cudart.dev
|
|
||||||
cuda_cudart.lib
|
|
||||||
cuda_cudart.static
|
|
||||||
libcublas.dev
|
|
||||||
libcublas.lib
|
|
||||||
libcublas.static
|
|
||||||
];
|
];
|
||||||
|
|
||||||
rocmBuildInputs = with rocmPackages; [
|
rocmBuildInputs = with rocmPackages; [
|
||||||
|
|
6
flake.lock
generated
6
flake.lock
generated
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1721379653,
|
"lastModified": 1722062969,
|
||||||
"narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
|
"narHash": "sha256-QOS0ykELUmPbrrUGmegAUlpmUFznDQeR4q7rFhl8eQg=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
|
"rev": "b73c2221a46c13557b1b3be9c2070cc42cf01eb3",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
|
@ -208,6 +208,7 @@ set(GGML_PUBLIC_HEADERS
|
||||||
include/ggml-alloc.h
|
include/ggml-alloc.h
|
||||||
include/ggml-backend.h
|
include/ggml-backend.h
|
||||||
include/ggml-blas.h
|
include/ggml-blas.h
|
||||||
|
include/ggml-cann.h
|
||||||
include/ggml-cuda.h
|
include/ggml-cuda.h
|
||||||
include/ggml.h
|
include/ggml.h
|
||||||
include/ggml-kompute.h
|
include/ggml-kompute.h
|
||||||
|
|
|
@ -849,11 +849,6 @@ if (GGML_CANN)
|
||||||
${CANN_INSTALL_DIR}/acllib/include
|
${CANN_INSTALL_DIR}/acllib/include
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO: find libs
|
|
||||||
link_directories(
|
|
||||||
${CANN_INSTALL_DIR}/lib64
|
|
||||||
)
|
|
||||||
|
|
||||||
add_subdirectory(ggml-cann/kernels)
|
add_subdirectory(ggml-cann/kernels)
|
||||||
list(APPEND CANN_LIBRARIES
|
list(APPEND CANN_LIBRARIES
|
||||||
ascendcl
|
ascendcl
|
||||||
|
@ -872,6 +867,7 @@ if (GGML_CANN)
|
||||||
|
|
||||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
|
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
|
||||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
|
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
|
||||||
|
set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64)
|
||||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
|
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
|
|
|
@ -27,255 +27,11 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
#include <hip/hip_runtime.h>
|
#include "vendors/hip.h"
|
||||||
#include <hipblas/hipblas.h>
|
|
||||||
#include <hip/hip_fp16.h>
|
|
||||||
#ifdef __HIP_PLATFORM_AMD__
|
|
||||||
// for rocblas_initialize()
|
|
||||||
#include "rocblas/rocblas.h"
|
|
||||||
#endif // __HIP_PLATFORM_AMD__
|
|
||||||
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
|
||||||
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
|
||||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
|
||||||
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
|
||||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
|
||||||
#define CUBLAS_OP_N HIPBLAS_OP_N
|
|
||||||
#define CUBLAS_OP_T HIPBLAS_OP_T
|
|
||||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
|
||||||
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
|
||||||
#define CUDA_R_16F HIPBLAS_R_16F
|
|
||||||
#define CUDA_R_32F HIPBLAS_R_32F
|
|
||||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
|
||||||
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
|
||||||
#define cublasCreate hipblasCreate
|
|
||||||
#define cublasDestroy hipblasDestroy
|
|
||||||
#define cublasGemmEx hipblasGemmEx
|
|
||||||
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
|
||||||
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
|
||||||
#define cublasHandle_t hipblasHandle_t
|
|
||||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
|
||||||
#define cublasSetStream hipblasSetStream
|
|
||||||
#define cublasSgemm hipblasSgemm
|
|
||||||
#define cublasStatus_t hipblasStatus_t
|
|
||||||
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
|
||||||
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
|
||||||
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
|
||||||
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
|
||||||
#define cudaDeviceProp hipDeviceProp_t
|
|
||||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
|
||||||
#define cudaError_t hipError_t
|
|
||||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
|
||||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
|
||||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
|
||||||
#define cudaEventDisableTiming hipEventDisableTiming
|
|
||||||
#define cudaEventRecord hipEventRecord
|
|
||||||
#define cudaEventSynchronize hipEventSynchronize
|
|
||||||
#define cudaEvent_t hipEvent_t
|
|
||||||
#define cudaEventDestroy hipEventDestroy
|
|
||||||
#define cudaFree hipFree
|
|
||||||
#define cudaFreeHost hipHostFree
|
|
||||||
#define cudaGetDevice hipGetDevice
|
|
||||||
#define cudaGetDeviceCount hipGetDeviceCount
|
|
||||||
#define cudaGetDeviceProperties hipGetDeviceProperties
|
|
||||||
#define cudaGetErrorString hipGetErrorString
|
|
||||||
#define cudaGetLastError hipGetLastError
|
|
||||||
#define cudaHostRegister hipHostRegister
|
|
||||||
#define cudaHostRegisterPortable hipHostRegisterPortable
|
|
||||||
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
|
||||||
#define cudaHostUnregister hipHostUnregister
|
|
||||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
|
||||||
#define cudaMalloc hipMalloc
|
|
||||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
|
||||||
#define cudaMemcpy hipMemcpy
|
|
||||||
#define cudaMemcpyAsync hipMemcpyAsync
|
|
||||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
|
||||||
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
|
||||||
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
|
||||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
|
||||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
|
||||||
#define cudaMemcpyKind hipMemcpyKind
|
|
||||||
#define cudaMemset hipMemset
|
|
||||||
#define cudaMemsetAsync hipMemsetAsync
|
|
||||||
#define cudaMemGetInfo hipMemGetInfo
|
|
||||||
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
|
||||||
#define cudaSetDevice hipSetDevice
|
|
||||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
|
||||||
#define cudaStreamDestroy hipStreamDestroy
|
|
||||||
#define cudaStreamFireAndForget hipStreamFireAndForget
|
|
||||||
#define cudaStreamNonBlocking hipStreamNonBlocking
|
|
||||||
#define cudaStreamPerThread hipStreamPerThread
|
|
||||||
#define cudaStreamSynchronize hipStreamSynchronize
|
|
||||||
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
|
||||||
#define cudaStream_t hipStream_t
|
|
||||||
#define cudaSuccess hipSuccess
|
|
||||||
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
|
||||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
|
||||||
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
|
||||||
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
|
||||||
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
|
|
||||||
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
|
|
||||||
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
|
|
||||||
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
|
||||||
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
|
||||||
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
|
||||||
#elif defined(GGML_USE_MUSA)
|
#elif defined(GGML_USE_MUSA)
|
||||||
#include <musa_runtime.h>
|
#include "vendors/musa.h"
|
||||||
#include <musa.h>
|
|
||||||
#include <mublas.h>
|
|
||||||
#include <musa_fp16.h>
|
|
||||||
// XXX: Keep the following order the same as hipBLAS
|
|
||||||
// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
|
|
||||||
// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
|
|
||||||
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
|
|
||||||
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
|
|
||||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
|
|
||||||
#define CUBLAS_OP_N MUBLAS_OP_N
|
|
||||||
#define CUBLAS_OP_T MUBLAS_OP_T
|
|
||||||
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
|
||||||
// #define CUBLAS_TF32_TENSOR_OP_MATH 0
|
|
||||||
#define CUDA_R_16F MUSA_R_16F
|
|
||||||
#define CUDA_R_32F MUSA_R_32F
|
|
||||||
// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
|
||||||
// #define cublasComputeType_t mublasComputeType_t
|
|
||||||
#define cublasCreate mublasCreate
|
|
||||||
#define cublasDestroy mublasDestroy
|
|
||||||
#define cublasGemmEx mublasGemmEx
|
|
||||||
#define cublasGemmBatchedEx mublasGemmBatchedEx
|
|
||||||
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
|
|
||||||
#define cublasHandle_t mublasHandle_t
|
|
||||||
// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
|
||||||
#define cublasSetMathMode mublasSetMathMode
|
|
||||||
#define cublasSetStream mublasSetStream
|
|
||||||
#define cublasSgemm mublasSgemm
|
|
||||||
#define cublasStatus_t mublasStatus_t
|
|
||||||
#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
|
|
||||||
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
|
||||||
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
|
|
||||||
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
|
|
||||||
#define cudaDeviceProp musaDeviceProp
|
|
||||||
#define cudaDeviceSynchronize musaDeviceSynchronize
|
|
||||||
#define cudaError_t musaError_t
|
|
||||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
|
||||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
|
||||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
|
||||||
#define cudaEventDisableTiming musaEventDisableTiming
|
|
||||||
#define cudaEventRecord musaEventRecord
|
|
||||||
#define cudaEventSynchronize musaEventSynchronize
|
|
||||||
#define cudaEvent_t musaEvent_t
|
|
||||||
#define cudaEventDestroy musaEventDestroy
|
|
||||||
#define cudaFree musaFree
|
|
||||||
#define cudaFreeHost musaFreeHost
|
|
||||||
#define cudaGetDevice musaGetDevice
|
|
||||||
#define cudaGetDeviceCount musaGetDeviceCount
|
|
||||||
#define cudaGetDeviceProperties musaGetDeviceProperties
|
|
||||||
#define cudaGetErrorString musaGetErrorString
|
|
||||||
#define cudaGetLastError musaGetLastError
|
|
||||||
#define cudaHostRegister musaHostRegister
|
|
||||||
#define cudaHostRegisterPortable musaHostRegisterPortable
|
|
||||||
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
|
|
||||||
#define cudaHostUnregister musaHostUnregister
|
|
||||||
#define cudaLaunchHostFunc musaLaunchHostFunc
|
|
||||||
#define cudaMalloc musaMalloc
|
|
||||||
#define cudaMallocHost musaMallocHost
|
|
||||||
#define cudaMemcpy musaMemcpy
|
|
||||||
#define cudaMemcpyAsync musaMemcpyAsync
|
|
||||||
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
|
|
||||||
#define cudaMemcpy2DAsync musaMemcpy2DAsync
|
|
||||||
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
|
|
||||||
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
|
|
||||||
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
|
|
||||||
#define cudaMemcpyKind musaMemcpyKind
|
|
||||||
#define cudaMemset musaMemset
|
|
||||||
#define cudaMemsetAsync musaMemsetAsync
|
|
||||||
#define cudaMemGetInfo musaMemGetInfo
|
|
||||||
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
|
|
||||||
#define cudaSetDevice musaSetDevice
|
|
||||||
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
|
|
||||||
#define cudaStreamDestroy musaStreamDestroy
|
|
||||||
#define cudaStreamFireAndForget musaStreamFireAndForget
|
|
||||||
#define cudaStreamNonBlocking musaStreamNonBlocking
|
|
||||||
#define cudaStreamPerThread musaStreamPerThread
|
|
||||||
#define cudaStreamSynchronize musaStreamSynchronize
|
|
||||||
#define cudaStreamWaitEvent musaStreamWaitEvent
|
|
||||||
#define cudaStream_t musaStream_t
|
|
||||||
#define cudaSuccess musaSuccess
|
|
||||||
|
|
||||||
// XXX: Other CUDA => MUSA mapping
|
|
||||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
|
|
||||||
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
|
|
||||||
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
|
|
||||||
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
|
|
||||||
#define CUdevice MUdevice
|
|
||||||
#define CUdeviceptr MUdeviceptr
|
|
||||||
#define CUmemAccessDesc MUmemAccessDesc
|
|
||||||
#define CUmemAllocationProp MUmemAllocationProp
|
|
||||||
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
|
|
||||||
#define cuDeviceGet muDeviceGet
|
|
||||||
#define cuDeviceGetAttribute muDeviceGetAttribute
|
|
||||||
#define cuMemAddressFree muMemAddressFree
|
|
||||||
#define cuMemAddressReserve muMemAddressReserve
|
|
||||||
#define cuMemCreate muMemCreate
|
|
||||||
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
|
|
||||||
#define cuMemMap muMemMap
|
|
||||||
#define cuMemRelease muMemRelease
|
|
||||||
#define cuMemSetAccess muMemSetAccess
|
|
||||||
#define cuMemUnmap muMemUnmap
|
|
||||||
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
|
|
||||||
#define cudaFuncSetAttribute musaFuncSetAttribute
|
|
||||||
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
|
|
||||||
#define make_cudaExtent make_musaExtent
|
|
||||||
#define make_cudaPitchedPtr make_musaPitchedPtr
|
|
||||||
|
|
||||||
// XXX: USE_CUDA_GRAPH
|
|
||||||
#define CUDA_SUCCESS MUSA_SUCCESS
|
|
||||||
#define CUresult MUresult
|
|
||||||
#define cuGetErrorString muGetErrorString
|
|
||||||
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
|
|
||||||
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
|
|
||||||
#define cudaGraphDestroy musaGraphDestroy
|
|
||||||
#define cudaGraphExecDestroy musaGraphExecDestroy
|
|
||||||
#define cudaGraphExec_t musaGraphExec_t
|
|
||||||
#define cudaGraphExecUpdate musaGraphExecUpdate
|
|
||||||
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
|
|
||||||
#define cudaGraphGetNodes musaGraphGetNodes
|
|
||||||
#define cudaGraphInstantiate musaGraphInstantiate
|
|
||||||
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
|
|
||||||
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
|
|
||||||
#define cudaGraphLaunch musaGraphLaunch
|
|
||||||
#define cudaGraphNodeGetType musaGraphNodeGetType
|
|
||||||
#define cudaGraphNode_t musaGraphNode_t
|
|
||||||
#define cudaGraphNodeType musaGraphNodeType
|
|
||||||
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
|
|
||||||
#define cudaGraph_t musaGraph_t
|
|
||||||
#define cudaKernelNodeParams musaKernelNodeParams
|
|
||||||
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
|
||||||
#define cudaStreamEndCapture musaStreamEndCapture
|
|
||||||
|
|
||||||
// XXX: cuBLAS => muBLAS mapping
|
|
||||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
|
||||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
|
||||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
|
||||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
|
||||||
#define cublasComputeType_t cudaDataType_t
|
|
||||||
|
|
||||||
// XXX: Clang builtins mapping
|
|
||||||
#define __vsub4 __vsub4_musa
|
|
||||||
#define __vcmpeq4 __vcmpeq4_musa
|
|
||||||
#define __vcmpne4 __vcmpne4_musa
|
|
||||||
#else
|
#else
|
||||||
#include <cuda_runtime.h>
|
#include "vendors/cuda.h"
|
||||||
#include <cuda.h>
|
|
||||||
#include <cublas_v2.h>
|
|
||||||
#include <cuda_fp16.h>
|
|
||||||
|
|
||||||
#if CUDART_VERSION < 11020
|
|
||||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
|
||||||
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
|
||||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
|
||||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
|
||||||
#define cublasComputeType_t cudaDataType_t
|
|
||||||
#endif // CUDART_VERSION < 11020
|
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
#endif // defined(GGML_USE_HIPBLAS)
|
||||||
|
|
||||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||||
|
@ -318,11 +74,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
||||||
|
|
||||||
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
|
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
|
||||||
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||||
#ifndef GGML_USE_MUSA
|
|
||||||
return cublasGetStatusString(err);
|
return cublasGetStatusString(err);
|
||||||
#else
|
|
||||||
return mublasStatus_to_string(err);
|
|
||||||
#endif // GGML_USE_MUSA
|
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||||
|
@ -364,129 +116,7 @@ typedef half2 dfloat2;
|
||||||
#else
|
#else
|
||||||
typedef float dfloat; // dequantize float
|
typedef float dfloat; // dequantize float
|
||||||
typedef float2 dfloat2;
|
typedef float2 dfloat2;
|
||||||
#endif //GGML_CUDA_F16
|
#endif // GGML_CUDA_F16
|
||||||
|
|
||||||
#if defined(GGML_USE_MUSA)
|
|
||||||
#ifndef __has_builtin
|
|
||||||
#define __has_builtin(x) 0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
|
||||||
|
|
||||||
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
|
||||||
return __vsubss4(a, b);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
|
||||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
|
||||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
|
||||||
unsigned int c;
|
|
||||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; ++i) {
|
|
||||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
|
||||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
|
||||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
|
||||||
unsigned int c;
|
|
||||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; ++i) {
|
|
||||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
#endif // defined(GGML_USE_MUSA)
|
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
|
||||||
#define __CUDA_ARCH__ 1300
|
|
||||||
|
|
||||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
|
||||||
defined(__gfx1150__) || defined(__gfx1151__)
|
|
||||||
#define RDNA3
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
|
||||||
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
|
||||||
#define RDNA2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__gfx1010__) || defined(__gfx1012__)
|
|
||||||
#define RDNA1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef __has_builtin
|
|
||||||
#define __has_builtin(x) 0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
|
||||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
|
||||||
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
|
||||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
|
||||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
|
||||||
#if __has_builtin(__builtin_elementwise_sub_sat)
|
|
||||||
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
|
||||||
return reinterpret_cast<const int &>(c);
|
|
||||||
#else
|
|
||||||
int8x4_t c;
|
|
||||||
int16_t tmp;
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; i++) {
|
|
||||||
tmp = va[i] - vb[i];
|
|
||||||
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
|
|
||||||
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
|
|
||||||
c[i] = tmp;
|
|
||||||
}
|
|
||||||
return reinterpret_cast<int &>(c);
|
|
||||||
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
|
|
||||||
return __vsubss4(a, b);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
|
|
||||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
|
||||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
|
||||||
unsigned int c;
|
|
||||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; ++i) {
|
|
||||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
|
|
||||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
|
||||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
|
||||||
unsigned int c;
|
|
||||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < 4; ++i) {
|
|
||||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
|
||||||
// __shfl_xor() for half2 was added in ROCm 5.6
|
|
||||||
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
|
||||||
typedef union half2_b32 {
|
|
||||||
half2 val;
|
|
||||||
int b32;
|
|
||||||
} half2_b32_t;
|
|
||||||
half2_b32_t tmp;
|
|
||||||
tmp.val = var;
|
|
||||||
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
|
||||||
return tmp.val;
|
|
||||||
}
|
|
||||||
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
|
||||||
|
|
||||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||||
#define FP16_AVAILABLE
|
#define FP16_AVAILABLE
|
||||||
|
|
14
ggml/src/ggml-cuda/vendors/cuda.h
vendored
Normal file
14
ggml/src/ggml-cuda/vendors/cuda.h
vendored
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <cuda.h>
|
||||||
|
#include <cublas_v2.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
|
||||||
|
#if CUDART_VERSION < 11020
|
||||||
|
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||||
|
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
||||||
|
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||||
|
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||||
|
#define cublasComputeType_t cudaDataType_t
|
||||||
|
#endif // CUDART_VERSION < 11020
|
177
ggml/src/ggml-cuda/vendors/hip.h
vendored
Normal file
177
ggml/src/ggml-cuda/vendors/hip.h
vendored
Normal file
|
@ -0,0 +1,177 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <hip/hip_runtime.h>
|
||||||
|
#include <hipblas/hipblas.h>
|
||||||
|
#include <hip/hip_fp16.h>
|
||||||
|
#ifdef __HIP_PLATFORM_AMD__
|
||||||
|
// for rocblas_initialize()
|
||||||
|
#include "rocblas/rocblas.h"
|
||||||
|
#endif // __HIP_PLATFORM_AMD__
|
||||||
|
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
||||||
|
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
||||||
|
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||||
|
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||||
|
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
||||||
|
#define CUBLAS_OP_N HIPBLAS_OP_N
|
||||||
|
#define CUBLAS_OP_T HIPBLAS_OP_T
|
||||||
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||||
|
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||||
|
#define CUDA_R_16F HIPBLAS_R_16F
|
||||||
|
#define CUDA_R_32F HIPBLAS_R_32F
|
||||||
|
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||||
|
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
||||||
|
#define cublasCreate hipblasCreate
|
||||||
|
#define cublasDestroy hipblasDestroy
|
||||||
|
#define cublasGemmEx hipblasGemmEx
|
||||||
|
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
||||||
|
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
||||||
|
#define cublasHandle_t hipblasHandle_t
|
||||||
|
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||||
|
#define cublasSetStream hipblasSetStream
|
||||||
|
#define cublasSgemm hipblasSgemm
|
||||||
|
#define cublasStatus_t hipblasStatus_t
|
||||||
|
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
||||||
|
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
||||||
|
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
||||||
|
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
||||||
|
#define cudaDeviceProp hipDeviceProp_t
|
||||||
|
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||||
|
#define cudaError_t hipError_t
|
||||||
|
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||||
|
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||||
|
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||||
|
#define cudaEventDisableTiming hipEventDisableTiming
|
||||||
|
#define cudaEventRecord hipEventRecord
|
||||||
|
#define cudaEventSynchronize hipEventSynchronize
|
||||||
|
#define cudaEvent_t hipEvent_t
|
||||||
|
#define cudaEventDestroy hipEventDestroy
|
||||||
|
#define cudaFree hipFree
|
||||||
|
#define cudaFreeHost hipHostFree
|
||||||
|
#define cudaGetDevice hipGetDevice
|
||||||
|
#define cudaGetDeviceCount hipGetDeviceCount
|
||||||
|
#define cudaGetDeviceProperties hipGetDeviceProperties
|
||||||
|
#define cudaGetErrorString hipGetErrorString
|
||||||
|
#define cudaGetLastError hipGetLastError
|
||||||
|
#define cudaHostRegister hipHostRegister
|
||||||
|
#define cudaHostRegisterPortable hipHostRegisterPortable
|
||||||
|
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
||||||
|
#define cudaHostUnregister hipHostUnregister
|
||||||
|
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||||
|
#define cudaMalloc hipMalloc
|
||||||
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||||
|
#define cudaMemcpy hipMemcpy
|
||||||
|
#define cudaMemcpyAsync hipMemcpyAsync
|
||||||
|
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||||
|
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
||||||
|
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||||
|
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||||
|
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||||
|
#define cudaMemcpyKind hipMemcpyKind
|
||||||
|
#define cudaMemset hipMemset
|
||||||
|
#define cudaMemsetAsync hipMemsetAsync
|
||||||
|
#define cudaMemGetInfo hipMemGetInfo
|
||||||
|
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||||
|
#define cudaSetDevice hipSetDevice
|
||||||
|
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||||
|
#define cudaStreamDestroy hipStreamDestroy
|
||||||
|
#define cudaStreamFireAndForget hipStreamFireAndForget
|
||||||
|
#define cudaStreamNonBlocking hipStreamNonBlocking
|
||||||
|
#define cudaStreamPerThread hipStreamPerThread
|
||||||
|
#define cudaStreamSynchronize hipStreamSynchronize
|
||||||
|
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
||||||
|
#define cudaStream_t hipStream_t
|
||||||
|
#define cudaSuccess hipSuccess
|
||||||
|
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
||||||
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||||
|
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
||||||
|
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
||||||
|
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
|
||||||
|
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
|
||||||
|
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
|
||||||
|
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
||||||
|
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
||||||
|
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
||||||
|
|
||||||
|
#define __CUDA_ARCH__ 1300
|
||||||
|
|
||||||
|
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
||||||
|
defined(__gfx1150__) || defined(__gfx1151__)
|
||||||
|
#define RDNA3
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
||||||
|
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
||||||
|
#define RDNA2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__gfx1010__) || defined(__gfx1012__)
|
||||||
|
#define RDNA1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __has_builtin
|
||||||
|
#define __has_builtin(x) 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||||
|
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||||
|
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||||
|
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||||
|
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||||
|
#if __has_builtin(__builtin_elementwise_sub_sat)
|
||||||
|
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
||||||
|
return reinterpret_cast<const int &>(c);
|
||||||
|
#else
|
||||||
|
int8x4_t c;
|
||||||
|
int16_t tmp;
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
tmp = va[i] - vb[i];
|
||||||
|
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
|
||||||
|
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
|
||||||
|
c[i] = tmp;
|
||||||
|
}
|
||||||
|
return reinterpret_cast<int &>(c);
|
||||||
|
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
|
||||||
|
return __vsubss4(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
|
||||||
|
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||||
|
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||||
|
unsigned int c;
|
||||||
|
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
|
||||||
|
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||||
|
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||||
|
unsigned int c;
|
||||||
|
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||||
|
// __shfl_xor() for half2 was added in ROCm 5.6
|
||||||
|
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
||||||
|
typedef union half2_b32 {
|
||||||
|
half2 val;
|
||||||
|
int b32;
|
||||||
|
} half2_b32_t;
|
||||||
|
half2_b32_t tmp;
|
||||||
|
tmp.val = var;
|
||||||
|
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
||||||
|
return tmp.val;
|
||||||
|
}
|
||||||
|
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
171
ggml/src/ggml-cuda/vendors/musa.h
vendored
Normal file
171
ggml/src/ggml-cuda/vendors/musa.h
vendored
Normal file
|
@ -0,0 +1,171 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <musa_runtime.h>
|
||||||
|
#include <musa.h>
|
||||||
|
#include <mublas.h>
|
||||||
|
#include <musa_fp16.h>
|
||||||
|
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||||
|
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||||
|
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
|
||||||
|
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
|
||||||
|
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
|
||||||
|
#define CUBLAS_OP_N MUBLAS_OP_N
|
||||||
|
#define CUBLAS_OP_T MUBLAS_OP_T
|
||||||
|
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
||||||
|
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
||||||
|
#define CUDA_R_16F MUSA_R_16F
|
||||||
|
#define CUDA_R_32F MUSA_R_32F
|
||||||
|
#define cublasComputeType_t cudaDataType_t
|
||||||
|
#define cublasCreate mublasCreate
|
||||||
|
#define cublasDestroy mublasDestroy
|
||||||
|
#define cublasGemmEx mublasGemmEx
|
||||||
|
#define cublasGemmBatchedEx mublasGemmBatchedEx
|
||||||
|
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
|
||||||
|
#define cublasHandle_t mublasHandle_t
|
||||||
|
#define cublasSetMathMode mublasSetMathMode
|
||||||
|
#define cublasSetStream mublasSetStream
|
||||||
|
#define cublasSgemm mublasSgemm
|
||||||
|
#define cublasStatus_t mublasStatus_t
|
||||||
|
#define cublasGetStatusString mublasStatus_to_string
|
||||||
|
#define cudaDataType_t musaDataType_t
|
||||||
|
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
||||||
|
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
|
||||||
|
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
|
||||||
|
#define cudaDeviceProp musaDeviceProp
|
||||||
|
#define cudaDeviceSynchronize musaDeviceSynchronize
|
||||||
|
#define cudaError_t musaError_t
|
||||||
|
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||||
|
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||||
|
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||||
|
#define cudaEventDisableTiming musaEventDisableTiming
|
||||||
|
#define cudaEventRecord musaEventRecord
|
||||||
|
#define cudaEventSynchronize musaEventSynchronize
|
||||||
|
#define cudaEvent_t musaEvent_t
|
||||||
|
#define cudaEventDestroy musaEventDestroy
|
||||||
|
#define cudaFree musaFree
|
||||||
|
#define cudaFreeHost musaFreeHost
|
||||||
|
#define cudaGetDevice musaGetDevice
|
||||||
|
#define cudaGetDeviceCount musaGetDeviceCount
|
||||||
|
#define cudaGetDeviceProperties musaGetDeviceProperties
|
||||||
|
#define cudaGetErrorString musaGetErrorString
|
||||||
|
#define cudaGetLastError musaGetLastError
|
||||||
|
#define cudaHostRegister musaHostRegister
|
||||||
|
#define cudaHostRegisterPortable musaHostRegisterPortable
|
||||||
|
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
|
||||||
|
#define cudaHostUnregister musaHostUnregister
|
||||||
|
#define cudaLaunchHostFunc musaLaunchHostFunc
|
||||||
|
#define cudaMalloc musaMalloc
|
||||||
|
#define cudaMallocHost musaMallocHost
|
||||||
|
#define cudaMemcpy musaMemcpy
|
||||||
|
#define cudaMemcpyAsync musaMemcpyAsync
|
||||||
|
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
|
||||||
|
#define cudaMemcpy2DAsync musaMemcpy2DAsync
|
||||||
|
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
|
||||||
|
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
|
||||||
|
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
|
||||||
|
#define cudaMemcpyKind musaMemcpyKind
|
||||||
|
#define cudaMemset musaMemset
|
||||||
|
#define cudaMemsetAsync musaMemsetAsync
|
||||||
|
#define cudaMemGetInfo musaMemGetInfo
|
||||||
|
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
|
||||||
|
#define cudaSetDevice musaSetDevice
|
||||||
|
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
|
||||||
|
#define cudaStreamDestroy musaStreamDestroy
|
||||||
|
#define cudaStreamFireAndForget musaStreamFireAndForget
|
||||||
|
#define cudaStreamNonBlocking musaStreamNonBlocking
|
||||||
|
#define cudaStreamPerThread musaStreamPerThread
|
||||||
|
#define cudaStreamSynchronize musaStreamSynchronize
|
||||||
|
#define cudaStreamWaitEvent musaStreamWaitEvent
|
||||||
|
#define cudaStream_t musaStream_t
|
||||||
|
#define cudaSuccess musaSuccess
|
||||||
|
|
||||||
|
// Additional mappings for MUSA virtual memory pool
|
||||||
|
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||||
|
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
|
||||||
|
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
|
||||||
|
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
|
||||||
|
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
|
||||||
|
#define CUdevice MUdevice
|
||||||
|
#define CUdeviceptr MUdeviceptr
|
||||||
|
#define CUmemAccessDesc MUmemAccessDesc
|
||||||
|
#define CUmemAllocationProp MUmemAllocationProp
|
||||||
|
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
|
||||||
|
#define cuDeviceGet muDeviceGet
|
||||||
|
#define cuDeviceGetAttribute muDeviceGetAttribute
|
||||||
|
#define cuMemAddressFree muMemAddressFree
|
||||||
|
#define cuMemAddressReserve muMemAddressReserve
|
||||||
|
#define cuMemCreate muMemCreate
|
||||||
|
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
|
||||||
|
#define cuMemMap muMemMap
|
||||||
|
#define cuMemRelease muMemRelease
|
||||||
|
#define cuMemSetAccess muMemSetAccess
|
||||||
|
#define cuMemUnmap muMemUnmap
|
||||||
|
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
|
||||||
|
#define cudaFuncSetAttribute musaFuncSetAttribute
|
||||||
|
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
|
||||||
|
#define make_cudaExtent make_musaExtent
|
||||||
|
#define make_cudaPitchedPtr make_musaPitchedPtr
|
||||||
|
|
||||||
|
// Additional mappings for MUSA graphs
|
||||||
|
#define CUDA_SUCCESS MUSA_SUCCESS
|
||||||
|
#define CUresult MUresult
|
||||||
|
#define cuGetErrorString muGetErrorString
|
||||||
|
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
|
||||||
|
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
|
||||||
|
#define cudaGraphDestroy musaGraphDestroy
|
||||||
|
#define cudaGraphExecDestroy musaGraphExecDestroy
|
||||||
|
#define cudaGraphExec_t musaGraphExec_t
|
||||||
|
#define cudaGraphExecUpdate musaGraphExecUpdate
|
||||||
|
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
|
||||||
|
#define cudaGraphGetNodes musaGraphGetNodes
|
||||||
|
#define cudaGraphInstantiate musaGraphInstantiate
|
||||||
|
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
|
||||||
|
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
|
||||||
|
#define cudaGraphLaunch musaGraphLaunch
|
||||||
|
#define cudaGraphNodeGetType musaGraphNodeGetType
|
||||||
|
#define cudaGraphNode_t musaGraphNode_t
|
||||||
|
#define cudaGraphNodeType musaGraphNodeType
|
||||||
|
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
|
||||||
|
#define cudaGraph_t musaGraph_t
|
||||||
|
#define cudaKernelNodeParams musaKernelNodeParams
|
||||||
|
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
||||||
|
#define cudaStreamEndCapture musaStreamEndCapture
|
||||||
|
|
||||||
|
// XXX: Clang builtins mapping
|
||||||
|
#define __vsub4 __vsub4_musa
|
||||||
|
#define __vcmpeq4 __vcmpeq4_musa
|
||||||
|
#define __vcmpne4 __vcmpne4_musa
|
||||||
|
|
||||||
|
#ifndef __has_builtin
|
||||||
|
#define __has_builtin(x) 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
||||||
|
return __vsubss4(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
||||||
|
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||||
|
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||||
|
unsigned int c;
|
||||||
|
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
||||||
|
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||||
|
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||||
|
unsigned int c;
|
||||||
|
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
}
|
|
@ -6449,22 +6449,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||||
// compute mask for subtraction
|
// compute mask for subtraction
|
||||||
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
|
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
|
||||||
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
|
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
|
||||||
m <<= 1;
|
m <<= 1;
|
||||||
|
|
||||||
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
|
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
|
||||||
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
|
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
|
||||||
m <<= 1;
|
m <<= 1;
|
||||||
|
|
||||||
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
|
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
|
||||||
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
|
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
|
||||||
m <<= 1;
|
m <<= 1;
|
||||||
|
|
||||||
vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
|
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
|
||||||
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
|
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
|
||||||
m <<= 1;
|
m <<= 1;
|
||||||
|
|
||||||
// load Q8 and take product with Q3
|
// load Q8 and take product with Q3
|
||||||
|
@ -7720,13 +7720,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||||
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
|
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
|
||||||
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
|
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
|
||||||
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
|
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
|
||||||
m <<= 1;
|
m <<= 1;
|
||||||
|
|
||||||
vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
|
vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
|
||||||
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
|
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
|
||||||
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
|
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
|
||||||
m <<= 1;
|
m <<= 1;
|
||||||
|
|
||||||
vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
|
vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
|
||||||
|
|
|
@ -3981,6 +3981,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
||||||
ggml_sycl_func_t func;
|
ggml_sycl_func_t func;
|
||||||
|
|
||||||
switch (tensor->op) {
|
switch (tensor->op) {
|
||||||
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||||
|
func = ggml_sycl_op_conv_transpose_1d;
|
||||||
|
break;
|
||||||
case GGML_OP_REPEAT:
|
case GGML_OP_REPEAT:
|
||||||
func = ggml_sycl_repeat;
|
func = ggml_sycl_repeat;
|
||||||
break;
|
break;
|
||||||
|
@ -4105,6 +4108,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
||||||
case GGML_OP_ARGSORT:
|
case GGML_OP_ARGSORT:
|
||||||
func = ggml_sycl_argsort;
|
func = ggml_sycl_argsort;
|
||||||
break;
|
break;
|
||||||
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
func = ggml_sycl_op_timestep_embedding;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -5090,6 +5096,15 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||||
|
{
|
||||||
|
ggml_type src0_type = op->src[0]->type;
|
||||||
|
ggml_type src1_type = op->src[1]->type;
|
||||||
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
} break;
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
|
@ -5213,6 +5228,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_OP_UPSCALE:
|
case GGML_OP_UPSCALE:
|
||||||
case GGML_OP_PAD:
|
case GGML_OP_PAD:
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
|
|
||||||
#include "concat.hpp"
|
#include "concat.hpp"
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
|
#include "conv.hpp"
|
||||||
#include "convert.hpp"
|
#include "convert.hpp"
|
||||||
#include "dequantize.hpp"
|
#include "dequantize.hpp"
|
||||||
#include "dmmv.hpp"
|
#include "dmmv.hpp"
|
||||||
|
@ -23,5 +24,6 @@
|
||||||
#include "rope.hpp"
|
#include "rope.hpp"
|
||||||
#include "norm.hpp"
|
#include "norm.hpp"
|
||||||
#include "softmax.hpp"
|
#include "softmax.hpp"
|
||||||
|
#include "tsembd.hpp"
|
||||||
|
|
||||||
#endif // GGML_SYCL_BACKEND_HPP
|
#endif // GGML_SYCL_BACKEND_HPP
|
||||||
|
|
99
ggml/src/ggml-sycl/conv.cpp
Normal file
99
ggml/src/ggml-sycl/conv.cpp
Normal file
|
@ -0,0 +1,99 @@
|
||||||
|
//
|
||||||
|
// MIT license
|
||||||
|
// Copyright (C) 2024 Intel Corporation
|
||||||
|
// SPDX-License-Identifier: MIT
|
||||||
|
//
|
||||||
|
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "conv.hpp"
|
||||||
|
|
||||||
|
static void conv_transpose_1d_kernel(
|
||||||
|
const int s0, const int output_size,
|
||||||
|
const int src0_ne0, const int src0_ne1, const int src0_ne2,
|
||||||
|
const int src1_ne0, const int dst_ne0,
|
||||||
|
const float * src0, const float * src1, float * dst,
|
||||||
|
const sycl::nd_item<3> &item_ct1) {
|
||||||
|
int global_index = item_ct1.get_local_id(2) +
|
||||||
|
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||||
|
if (global_index >= output_size) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int out_index = global_index / dst_ne0;
|
||||||
|
|
||||||
|
float accumulator = 0;
|
||||||
|
|
||||||
|
for (int c = 0; c < src0_ne2; c++) {
|
||||||
|
int idx = global_index % dst_ne0;
|
||||||
|
|
||||||
|
int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
|
||||||
|
int input_offset = src1_ne0 * c;
|
||||||
|
|
||||||
|
for (int i = 0; i < src1_ne0; i++) {
|
||||||
|
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int weight_idx = idx - i*s0;
|
||||||
|
|
||||||
|
float kernel_weight = src0[kernel_offset + weight_idx];
|
||||||
|
float input_value = src1[input_offset+i];
|
||||||
|
|
||||||
|
accumulator += kernel_weight * input_value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dst[global_index] = accumulator;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void conv_transpose_1d_f32_f32_sycl(
|
||||||
|
const int s0, const int output_size,
|
||||||
|
const int src0_ne0, const int src0_ne1, const int src0_ne2,
|
||||||
|
const int src1_ne0, const int dst_ne0,
|
||||||
|
const float *src0, const float *src1, float *dst,
|
||||||
|
const queue_ptr& stream) {
|
||||||
|
|
||||||
|
const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
|
||||||
|
const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
|
||||||
|
const sycl::range<3> block_nums(1, 1, num_blocks);
|
||||||
|
stream->parallel_for(
|
||||||
|
sycl::nd_range<3>(
|
||||||
|
block_nums * block_dims, block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
|
conv_transpose_1d_kernel(
|
||||||
|
s0, output_size,
|
||||||
|
src0_ne0, src0_ne1, src0_ne2,
|
||||||
|
src1_ne0, dst_ne0,
|
||||||
|
src0, src1, dst, item_ct1);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||||
|
const ggml_tensor *src1, ggml_tensor *dst) {
|
||||||
|
const float * src0_d = (const float *)src0->data;
|
||||||
|
const float * src1_d = (const float *)src1->data;
|
||||||
|
|
||||||
|
float * dst_d = (float *)dst->data;
|
||||||
|
dpct::queue_ptr stream = ctx.stream();
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||||
|
|
||||||
|
const int32_t * opts = (const int32_t *)dst->op_params;
|
||||||
|
|
||||||
|
const int s0 = opts[0];
|
||||||
|
|
||||||
|
const int64_t output_size = ggml_nelements(dst);
|
||||||
|
|
||||||
|
conv_transpose_1d_f32_f32_sycl(s0, output_size,
|
||||||
|
src0->ne[0], src0->ne[1], src0->ne[2],
|
||||||
|
src1->ne[0], dst->ne[0],
|
||||||
|
src0_d, src1_d, dst_d, stream);
|
||||||
|
}
|
||||||
|
|
21
ggml/src/ggml-sycl/conv.hpp
Normal file
21
ggml/src/ggml-sycl/conv.hpp
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
//
|
||||||
|
// MIT license
|
||||||
|
// Copyright (C) 2024 Intel Corporation
|
||||||
|
// SPDX-License-Identifier: MIT
|
||||||
|
//
|
||||||
|
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef GGML_SYCL_CONV_HPP
|
||||||
|
#define GGML_SYCL_CONV_HPP
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
|
||||||
|
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||||
|
const ggml_tensor *src1, ggml_tensor *dst);
|
||||||
|
|
||||||
|
#endif // GGML_SYCL_CONV_HPP
|
|
@ -41,6 +41,8 @@
|
||||||
#define SYCL_ACC_BLOCK_SIZE 256
|
#define SYCL_ACC_BLOCK_SIZE 256
|
||||||
#define SYCL_IM2COL_BLOCK_SIZE 256
|
#define SYCL_IM2COL_BLOCK_SIZE 256
|
||||||
#define SYCL_POOL2D_BLOCK_SIZE 256
|
#define SYCL_POOL2D_BLOCK_SIZE 256
|
||||||
|
#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
|
||||||
|
#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
|
||||||
|
|
||||||
// dmmv = dequantize_mul_mat_vec
|
// dmmv = dequantize_mul_mat_vec
|
||||||
#ifndef GGML_SYCL_DMMV_X
|
#ifndef GGML_SYCL_DMMV_X
|
||||||
|
|
71
ggml/src/ggml-sycl/tsembd.cpp
Normal file
71
ggml/src/ggml-sycl/tsembd.cpp
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
//
|
||||||
|
// MIT license
|
||||||
|
// Copyright (C) 2024 Intel Corporation
|
||||||
|
// SPDX-License-Identifier: MIT
|
||||||
|
//
|
||||||
|
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "tsembd.hpp"
|
||||||
|
|
||||||
|
static void timestep_embedding_f32(
|
||||||
|
const float * timesteps, float * dst, const int nb1,
|
||||||
|
const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) {
|
||||||
|
// item_ct1.get_group(1)(blockIDx.y): idx of timesteps->ne[0]
|
||||||
|
// item_ct1.get_group(2) (blockIDx.x): idx of ((dim + 1) / 2) / BLOCK_SIZE
|
||||||
|
int i = item_ct1.get_group(1);
|
||||||
|
int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||||
|
float * embed_data = (float *)((char *)dst + i*nb1);
|
||||||
|
|
||||||
|
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
|
||||||
|
embed_data[dim] = 0.f;
|
||||||
|
}
|
||||||
|
|
||||||
|
int half = dim / 2;
|
||||||
|
if (j >= half) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
float timestep = timesteps[i];
|
||||||
|
float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half);
|
||||||
|
float arg = timestep * freq;
|
||||||
|
embed_data[j] = sycl::cos(arg);
|
||||||
|
embed_data[j + half] = sycl::sin(arg);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void timestep_embedding_f32_sycl(
|
||||||
|
const float * x, float * dst, const int ne00, const int nb1,
|
||||||
|
const int dim, const int max_period, const queue_ptr& stream) {
|
||||||
|
// As the kernel returns when thread.idx is larger than dim/2, the half_ceil does not need to pad
|
||||||
|
int half_ceil = dim / 2;
|
||||||
|
int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
|
||||||
|
sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
|
||||||
|
sycl::range<3> gridDim(1, ne00, num_blocks);
|
||||||
|
stream->parallel_for(
|
||||||
|
sycl::nd_range<3>(
|
||||||
|
gridDim * block_dims, block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
|
timestep_embedding_f32(
|
||||||
|
x, dst, nb1, dim, max_period, item_ct1
|
||||||
|
);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||||
|
const ggml_tensor *src1, ggml_tensor * dst) {
|
||||||
|
const float * src0_d = (const float *)src0->data;
|
||||||
|
float * dst_d = (float *)dst->data;
|
||||||
|
dpct::queue_ptr stream = ctx.stream();
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
const int dim = dst->op_params[0];
|
||||||
|
const int max_period = dst->op_params[1];
|
||||||
|
|
||||||
|
timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
|
||||||
|
}
|
21
ggml/src/ggml-sycl/tsembd.hpp
Normal file
21
ggml/src/ggml-sycl/tsembd.hpp
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
//
|
||||||
|
// MIT license
|
||||||
|
// Copyright (C) 2024 Intel Corporation
|
||||||
|
// SPDX-License-Identifier: MIT
|
||||||
|
//
|
||||||
|
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef GGML_SYCL_TSEMBD_HPP
|
||||||
|
#define GGML_SYCL_TSEMBD_HPP
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
|
||||||
|
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||||
|
const ggml_tensor *src1, ggml_tensor * dst);
|
||||||
|
|
||||||
|
#endif // GGML_SYCL_TSEMBD_HPP
|
|
@ -141,7 +141,51 @@ typedef pthread_t ggml_thread_t;
|
||||||
|
|
||||||
#include <sys/wait.h>
|
#include <sys/wait.h>
|
||||||
|
|
||||||
#if defined(__linux__)
|
#if defined(__ANDROID__)
|
||||||
|
#include <unwind.h>
|
||||||
|
#include <dlfcn.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
struct backtrace_state {
|
||||||
|
void ** current;
|
||||||
|
void ** end;
|
||||||
|
};
|
||||||
|
|
||||||
|
static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
|
||||||
|
struct backtrace_state * state = (struct backtrace_state *)arg;
|
||||||
|
uintptr_t pc = _Unwind_GetIP(context);
|
||||||
|
if (pc) {
|
||||||
|
if (state->current == state->end) {
|
||||||
|
return _URC_END_OF_STACK;
|
||||||
|
} else {
|
||||||
|
*state->current++ = (void*)pc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return _URC_NO_REASON;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_print_backtrace_symbols(void) {
|
||||||
|
const int max = 100;
|
||||||
|
void* buffer[max];
|
||||||
|
|
||||||
|
struct backtrace_state state = {buffer, buffer + max};
|
||||||
|
_Unwind_Backtrace(unwind_callback, &state);
|
||||||
|
|
||||||
|
int count = state.current - buffer;
|
||||||
|
|
||||||
|
for (int idx = 0; idx < count; ++idx) {
|
||||||
|
const void * addr = buffer[idx];
|
||||||
|
const char * symbol = "";
|
||||||
|
|
||||||
|
Dl_info info;
|
||||||
|
if (dladdr(addr, &info) && info.dli_sname) {
|
||||||
|
symbol = info.dli_sname;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#elif defined(__linux__)
|
||||||
#include <execinfo.h>
|
#include <execinfo.h>
|
||||||
static void ggml_print_backtrace_symbols(void) {
|
static void ggml_print_backtrace_symbols(void) {
|
||||||
void * trace[100];
|
void * trace[100];
|
||||||
|
|
|
@ -312,6 +312,8 @@ class GGUFWriter:
|
||||||
self.add_key_value(key, val, GGUFValueType.STRING)
|
self.add_key_value(key, val, GGUFValueType.STRING)
|
||||||
|
|
||||||
def add_array(self, key: str, val: Sequence[Any]) -> None:
|
def add_array(self, key: str, val: Sequence[Any]) -> None:
|
||||||
|
if len(val) == 0:
|
||||||
|
return
|
||||||
self.add_key_value(key, val, GGUFValueType.ARRAY)
|
self.add_key_value(key, val, GGUFValueType.ARRAY)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -845,7 +847,14 @@ class GGUFWriter:
|
||||||
encoded_val = val.encode("utf-8") if isinstance(val, str) else val
|
encoded_val = val.encode("utf-8") if isinstance(val, str) else val
|
||||||
kv_data += self._pack("Q", len(encoded_val))
|
kv_data += self._pack("Q", len(encoded_val))
|
||||||
kv_data += encoded_val
|
kv_data += encoded_val
|
||||||
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
|
elif vtype == GGUFValueType.ARRAY:
|
||||||
|
|
||||||
|
if not isinstance(val, Sequence):
|
||||||
|
raise ValueError("Invalid GGUF metadata array, expecting sequence")
|
||||||
|
|
||||||
|
if len(val) == 0:
|
||||||
|
raise ValueError("Invalid GGUF metadata array. Empty array")
|
||||||
|
|
||||||
if isinstance(val, bytes):
|
if isinstance(val, bytes):
|
||||||
ltype = GGUFValueType.UINT8
|
ltype = GGUFValueType.UINT8
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue