Merge branch 'master' into compilade/batch-splits
This commit is contained in:
commit
5679a3bdbb
44 changed files with 968 additions and 528 deletions
|
@ -207,6 +207,7 @@ set(GGML_PUBLIC_HEADERS
|
|||
include/ggml-alloc.h
|
||||
include/ggml-backend.h
|
||||
include/ggml-blas.h
|
||||
include/ggml-cann.h
|
||||
include/ggml-cuda.h
|
||||
include/ggml.h
|
||||
include/ggml-kompute.h
|
||||
|
|
|
@ -349,6 +349,7 @@ extern "C" {
|
|||
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
|
||||
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
|
||||
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
|
||||
GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
|
||||
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
|
||||
|
||||
struct ggml_object;
|
||||
|
|
|
@ -849,11 +849,6 @@ if (GGML_CANN)
|
|||
${CANN_INSTALL_DIR}/acllib/include
|
||||
)
|
||||
|
||||
# TODO: find libs
|
||||
link_directories(
|
||||
${CANN_INSTALL_DIR}/lib64
|
||||
)
|
||||
|
||||
add_subdirectory(ggml-cann/kernels)
|
||||
list(APPEND CANN_LIBRARIES
|
||||
ascendcl
|
||||
|
@ -872,6 +867,7 @@ if (GGML_CANN)
|
|||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
|
||||
set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64)
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
|
||||
endif()
|
||||
else()
|
||||
|
|
|
@ -384,8 +384,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
if (svcntw() == 8) {
|
||||
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
||||
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
||||
}
|
||||
#endif
|
||||
|
@ -496,8 +496,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
if (svcntw() == 8) {
|
||||
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
||||
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
||||
}
|
||||
#endif
|
||||
|
@ -614,7 +614,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
||||
if (svcntw() == 8) {
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
const void * b_ptr = vx;
|
||||
const void * a_ptr = vy;
|
||||
float * res_ptr = s;
|
||||
|
@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
return;
|
||||
}
|
||||
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
||||
GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
|
||||
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
||||
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
|
||||
"performance");
|
||||
}
|
||||
else if (ggml_cpu_has_neon()) {
|
||||
GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
|
||||
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
|
||||
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
|
||||
"quantization format for optimal performance");
|
||||
}
|
||||
|
@ -745,8 +745,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (svcntw() == 8) {
|
||||
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
||||
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
||||
}
|
||||
#endif
|
||||
|
@ -1266,8 +1266,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (svcntw() == 8) {
|
||||
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
||||
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
||||
}
|
||||
#endif
|
||||
|
@ -1728,7 +1728,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
||||
if (svcntw() == 8) {
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
const void * b_ptr = vx;
|
||||
const void * a_ptr = vy;
|
||||
float * res_ptr = s;
|
||||
|
@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
return;
|
||||
}
|
||||
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
||||
GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
|
||||
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
|
||||
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
|
||||
"performance");
|
||||
}
|
||||
else if (ggml_cpu_has_neon()) {
|
||||
GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
|
||||
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
|
||||
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
|
||||
"quantization format for optimal performance");
|
||||
}
|
||||
|
|
|
@ -1312,6 +1312,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
|
|||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
|
||||
ggml_tensor* dst,
|
||||
ggml_tensor* src1,
|
||||
aclTensor* tmp_cast_tensor,
|
||||
aclTensor* tmp_im2col_tensor) {
|
||||
// Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
|
||||
int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
|
||||
size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
|
||||
aclTensor* acl_dst =
|
||||
ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
|
||||
|
||||
int64_t permute_dim[] = {0, 2, 1};
|
||||
if (src1->type != dst->type) {
|
||||
aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
|
||||
} else {
|
||||
aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
|
||||
}
|
||||
|
||||
// release
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
static void ggml_cann_im2col_1d_post_process(
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
|
||||
aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
|
||||
const std::vector<int64_t>& im2col_op_params) {
|
||||
// get params
|
||||
const int64_t KH = im2col_op_params[0];
|
||||
const int64_t KW = im2col_op_params[1];
|
||||
const int64_t IW = im2col_op_params[2];
|
||||
const int64_t IC = im2col_op_params[3];
|
||||
const int64_t N = im2col_op_params[4];
|
||||
const int64_t OH = im2col_op_params[5];
|
||||
const int64_t OW = im2col_op_params[6];
|
||||
const int64_t s0 = im2col_op_params[7];
|
||||
const int64_t p0 = im2col_op_params[8];
|
||||
const int64_t d0 = im2col_op_params[9];
|
||||
const int64_t n_bytes_factor = im2col_op_params[10];
|
||||
|
||||
// Permute: [N, IC * KH * KW, OW * OH] ->
|
||||
// [N, OW * OH * n_bytes_factor, IC * KH * KW]
|
||||
aclTensor* tmp_permute_tensor = nullptr;
|
||||
ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
|
||||
tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
|
||||
void* tmp_permute_buffer = tmp_permute_allocator.get();
|
||||
|
||||
int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
|
||||
size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
|
||||
tmp_permute_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
|
||||
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
|
||||
}
|
||||
|
||||
tmp_permute_tensor = ggml_cann_create_tensor(
|
||||
tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
|
||||
GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
|
||||
|
||||
int64_t permute_dim[] = {0, 2, 1};
|
||||
if (src1->type != dst->type) {
|
||||
aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
|
||||
} else {
|
||||
aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
|
||||
3);
|
||||
}
|
||||
|
||||
// number of times the kernel moves in W dimension
|
||||
const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
|
||||
size_t offset;
|
||||
void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
|
||||
|
||||
// memory copy with offset to restore 1D im2col from 2d
|
||||
if (IC > 1) {
|
||||
offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
|
||||
size_t size_cpy = KH * KW * ggml_type_size(dst->type);
|
||||
|
||||
for (int c = 0; c < IC; c++) {
|
||||
cur_permute_buffer = (char*)tmp_permute_buffer + offset +
|
||||
KH * KW * c * ggml_type_size(dst->type);
|
||||
cur_dst_buffer = (char*)dst->data +
|
||||
c * KH * KW * n_step_w * ggml_type_size(dst->type);
|
||||
|
||||
for (int i = 0; i < n_step_w; i++) {
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
||||
cur_dst_buffer =
|
||||
(char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
|
||||
cur_permute_buffer = (char*)cur_permute_buffer +
|
||||
KH * KW * IC * ggml_type_size(dst->type);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
offset = KH * KW * n_step_w *
|
||||
ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
|
||||
(char*)tmp_permute_buffer + offset, offset,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
||||
}
|
||||
|
||||
// release
|
||||
ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
|
||||
}
|
||||
|
||||
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0]; // kernel
|
||||
ggml_tensor* src1 = dst->src[1]; // input
|
||||
|
@ -1320,21 +1425,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
|
||||
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
|
||||
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
|
||||
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
|
||||
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
const int64_t N = is_2D ? ne13 : ne12;
|
||||
const int64_t IC = is_2D ? ne12 : ne11;
|
||||
// aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
|
||||
// im2col and do post-processing to restore it to 1D.
|
||||
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
|
||||
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
|
||||
const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
|
||||
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
|
||||
const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
|
||||
|
||||
const int64_t KH = is_2D ? ne01 : 1;
|
||||
const int64_t N = ne13;
|
||||
const int64_t IC = ne12;
|
||||
const int64_t KH = ne01;
|
||||
const int64_t KW = ne00;
|
||||
const int64_t IW = ne10;
|
||||
|
||||
const int64_t OH = is_2D ? ne2 : 1;
|
||||
const int64_t OW = ne1;
|
||||
|
@ -1342,9 +1449,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
// im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
|
||||
// memory allocated increased to 3x when is_2D == false
|
||||
const int64_t n_bytes_factor = is_2D ? 1 : 3;
|
||||
|
||||
// im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
|
||||
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
||||
int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
|
||||
int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
|
||||
size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
|
||||
|
||||
tmp_im2col_nb[0] = ggml_type_size(src1->type);
|
||||
|
@ -1356,8 +1466,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
// If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
|
||||
// dst.elemcount.
|
||||
ggml_cann_pool_alloc im2col_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1));
|
||||
ctx.pool(),
|
||||
ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
|
||||
void* tmp_im2col_buffer = im2col_allocator.get();
|
||||
|
||||
aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
|
||||
tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
|
||||
ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
|
||||
|
@ -1380,8 +1492,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
paddings, strides, tmp_im2col_tensor,
|
||||
&workspaceSize, &executor));
|
||||
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspace_allocator.alloc(workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
|
@ -1391,9 +1504,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
// Cast if dst is f16.
|
||||
aclTensor* tmp_cast_tensor = nullptr;
|
||||
ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
|
||||
void* tmp_cast_buffer = nullptr;
|
||||
if (src1->type != dst->type) {
|
||||
tmp_cast_allocator.alloc(ggml_nbytes(dst));
|
||||
void* tmp_cast_buffer = tmp_cast_allocator.get();
|
||||
tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
|
||||
tmp_cast_buffer = tmp_cast_allocator.get();
|
||||
size_t temp_cast_nb[GGML_MAX_DIMS - 1];
|
||||
temp_cast_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
|
||||
|
@ -1408,24 +1522,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
ggml_cann_type_mapping(dst->type));
|
||||
}
|
||||
|
||||
// Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
|
||||
int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
|
||||
size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
|
||||
aclTensor* acl_dst =
|
||||
ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
|
||||
|
||||
int64_t permute_dim[] = {0, 2, 1};
|
||||
if (src1->type != dst->type) {
|
||||
aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
|
||||
// post-processing
|
||||
if (is_2D) {
|
||||
ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
|
||||
tmp_im2col_tensor);
|
||||
} else {
|
||||
aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
|
||||
std::vector<int64_t> im2col_op_params = {
|
||||
KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
|
||||
ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
|
||||
tmp_im2col_tensor, im2col_op_params);
|
||||
}
|
||||
|
||||
// release
|
||||
ACL_CHECK(aclDestroyTensor(acl_src1));
|
||||
ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
ACL_CHECK(aclDestroyIntArray(kernel_size));
|
||||
ACL_CHECK(aclDestroyIntArray(dilations));
|
||||
ACL_CHECK(aclDestroyIntArray(paddings));
|
||||
|
@ -2381,10 +2492,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
|
|||
size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
|
||||
size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
|
||||
|
||||
ggml_cann_pool_alloc input_alloctor(ctx.pool());
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
|
||||
ggml_cann_pool_alloc input_alloctor(
|
||||
ctx.pool(), ggml_nelements(src1) * input_elem_size);
|
||||
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
||||
input_buffer = input_alloctor.get();
|
||||
|
||||
int64_t* input_cast_ne = src1->ne;
|
||||
|
|
|
@ -130,7 +130,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
|||
}
|
||||
return res;
|
||||
#else
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
cudaError_t err;
|
||||
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
||||
{
|
||||
err = cudaMallocManaged(ptr, size);
|
||||
}
|
||||
else
|
||||
{
|
||||
err = cudaMalloc(ptr, size);
|
||||
}
|
||||
return err;
|
||||
#else
|
||||
return cudaMalloc(ptr, size);
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -1885,10 +1900,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
|||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
||||
|
||||
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
||||
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
|
||||
&& src1->ne[1] == 1;
|
||||
&& src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||
|
|
|
@ -27,255 +27,11 @@
|
|||
#include <vector>
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// for rocblas_initialize()
|
||||
#include "rocblas/rocblas.h"
|
||||
#endif // __HIP_PLATFORM_AMD__
|
||||
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
||||
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_OP_N HIPBLAS_OP_N
|
||||
#define CUBLAS_OP_T HIPBLAS_OP_T
|
||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||
#define CUDA_R_16F HIPBLAS_R_16F
|
||||
#define CUDA_R_32F HIPBLAS_R_32F
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasDestroy hipblasDestroy
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
||||
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
||||
#define cublasHandle_t hipblasHandle_t
|
||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetStream hipblasSetStream
|
||||
#define cublasSgemm hipblasSgemm
|
||||
#define cublasStatus_t hipblasStatus_t
|
||||
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
||||
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
||||
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
||||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
#define cudaEventDisableTiming hipEventDisableTiming
|
||||
#define cudaEventRecord hipEventRecord
|
||||
#define cudaEventSynchronize hipEventSynchronize
|
||||
#define cudaEvent_t hipEvent_t
|
||||
#define cudaEventDestroy hipEventDestroy
|
||||
#define cudaFree hipFree
|
||||
#define cudaFreeHost hipHostFree
|
||||
#define cudaGetDevice hipGetDevice
|
||||
#define cudaGetDeviceCount hipGetDeviceCount
|
||||
#define cudaGetDeviceProperties hipGetDeviceProperties
|
||||
#define cudaGetErrorString hipGetErrorString
|
||||
#define cudaGetLastError hipGetLastError
|
||||
#define cudaHostRegister hipHostRegister
|
||||
#define cudaHostRegisterPortable hipHostRegisterPortable
|
||||
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
||||
#define cudaHostUnregister hipHostUnregister
|
||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||
#define cudaMalloc hipMalloc
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpyAsync hipMemcpyAsync
|
||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
||||
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||
#define cudaMemcpyKind hipMemcpyKind
|
||||
#define cudaMemset hipMemset
|
||||
#define cudaMemsetAsync hipMemsetAsync
|
||||
#define cudaMemGetInfo hipMemGetInfo
|
||||
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||
#define cudaSetDevice hipSetDevice
|
||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||
#define cudaStreamDestroy hipStreamDestroy
|
||||
#define cudaStreamFireAndForget hipStreamFireAndForget
|
||||
#define cudaStreamNonBlocking hipStreamNonBlocking
|
||||
#define cudaStreamPerThread hipStreamPerThread
|
||||
#define cudaStreamSynchronize hipStreamSynchronize
|
||||
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaSuccess hipSuccess
|
||||
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
||||
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
||||
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
|
||||
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
|
||||
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
|
||||
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
||||
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
||||
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
||||
#include "vendors/hip.h"
|
||||
#elif defined(GGML_USE_MUSA)
|
||||
#include <musa_runtime.h>
|
||||
#include <musa.h>
|
||||
#include <mublas.h>
|
||||
#include <musa_fp16.h>
|
||||
// XXX: Keep the following order the same as hipBLAS
|
||||
// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
|
||||
// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
|
||||
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_OP_N MUBLAS_OP_N
|
||||
#define CUBLAS_OP_T MUBLAS_OP_T
|
||||
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
||||
// #define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||
#define CUDA_R_16F MUSA_R_16F
|
||||
#define CUDA_R_32F MUSA_R_32F
|
||||
// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
// #define cublasComputeType_t mublasComputeType_t
|
||||
#define cublasCreate mublasCreate
|
||||
#define cublasDestroy mublasDestroy
|
||||
#define cublasGemmEx mublasGemmEx
|
||||
#define cublasGemmBatchedEx mublasGemmBatchedEx
|
||||
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
|
||||
#define cublasHandle_t mublasHandle_t
|
||||
// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetMathMode mublasSetMathMode
|
||||
#define cublasSetStream mublasSetStream
|
||||
#define cublasSgemm mublasSgemm
|
||||
#define cublasStatus_t mublasStatus_t
|
||||
#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
|
||||
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
|
||||
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
|
||||
#define cudaDeviceProp musaDeviceProp
|
||||
#define cudaDeviceSynchronize musaDeviceSynchronize
|
||||
#define cudaError_t musaError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||
#define cudaEventDisableTiming musaEventDisableTiming
|
||||
#define cudaEventRecord musaEventRecord
|
||||
#define cudaEventSynchronize musaEventSynchronize
|
||||
#define cudaEvent_t musaEvent_t
|
||||
#define cudaEventDestroy musaEventDestroy
|
||||
#define cudaFree musaFree
|
||||
#define cudaFreeHost musaFreeHost
|
||||
#define cudaGetDevice musaGetDevice
|
||||
#define cudaGetDeviceCount musaGetDeviceCount
|
||||
#define cudaGetDeviceProperties musaGetDeviceProperties
|
||||
#define cudaGetErrorString musaGetErrorString
|
||||
#define cudaGetLastError musaGetLastError
|
||||
#define cudaHostRegister musaHostRegister
|
||||
#define cudaHostRegisterPortable musaHostRegisterPortable
|
||||
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
|
||||
#define cudaHostUnregister musaHostUnregister
|
||||
#define cudaLaunchHostFunc musaLaunchHostFunc
|
||||
#define cudaMalloc musaMalloc
|
||||
#define cudaMallocHost musaMallocHost
|
||||
#define cudaMemcpy musaMemcpy
|
||||
#define cudaMemcpyAsync musaMemcpyAsync
|
||||
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
|
||||
#define cudaMemcpy2DAsync musaMemcpy2DAsync
|
||||
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
|
||||
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
|
||||
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
|
||||
#define cudaMemcpyKind musaMemcpyKind
|
||||
#define cudaMemset musaMemset
|
||||
#define cudaMemsetAsync musaMemsetAsync
|
||||
#define cudaMemGetInfo musaMemGetInfo
|
||||
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
|
||||
#define cudaSetDevice musaSetDevice
|
||||
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
|
||||
#define cudaStreamDestroy musaStreamDestroy
|
||||
#define cudaStreamFireAndForget musaStreamFireAndForget
|
||||
#define cudaStreamNonBlocking musaStreamNonBlocking
|
||||
#define cudaStreamPerThread musaStreamPerThread
|
||||
#define cudaStreamSynchronize musaStreamSynchronize
|
||||
#define cudaStreamWaitEvent musaStreamWaitEvent
|
||||
#define cudaStream_t musaStream_t
|
||||
#define cudaSuccess musaSuccess
|
||||
|
||||
// XXX: Other CUDA => MUSA mapping
|
||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
|
||||
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
|
||||
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
|
||||
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
|
||||
#define CUdevice MUdevice
|
||||
#define CUdeviceptr MUdeviceptr
|
||||
#define CUmemAccessDesc MUmemAccessDesc
|
||||
#define CUmemAllocationProp MUmemAllocationProp
|
||||
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
|
||||
#define cuDeviceGet muDeviceGet
|
||||
#define cuDeviceGetAttribute muDeviceGetAttribute
|
||||
#define cuMemAddressFree muMemAddressFree
|
||||
#define cuMemAddressReserve muMemAddressReserve
|
||||
#define cuMemCreate muMemCreate
|
||||
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
|
||||
#define cuMemMap muMemMap
|
||||
#define cuMemRelease muMemRelease
|
||||
#define cuMemSetAccess muMemSetAccess
|
||||
#define cuMemUnmap muMemUnmap
|
||||
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
|
||||
#define cudaFuncSetAttribute musaFuncSetAttribute
|
||||
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
|
||||
#define make_cudaExtent make_musaExtent
|
||||
#define make_cudaPitchedPtr make_musaPitchedPtr
|
||||
|
||||
// XXX: USE_CUDA_GRAPH
|
||||
#define CUDA_SUCCESS MUSA_SUCCESS
|
||||
#define CUresult MUresult
|
||||
#define cuGetErrorString muGetErrorString
|
||||
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
|
||||
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
|
||||
#define cudaGraphDestroy musaGraphDestroy
|
||||
#define cudaGraphExecDestroy musaGraphExecDestroy
|
||||
#define cudaGraphExec_t musaGraphExec_t
|
||||
#define cudaGraphExecUpdate musaGraphExecUpdate
|
||||
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
|
||||
#define cudaGraphGetNodes musaGraphGetNodes
|
||||
#define cudaGraphInstantiate musaGraphInstantiate
|
||||
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
|
||||
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
|
||||
#define cudaGraphLaunch musaGraphLaunch
|
||||
#define cudaGraphNodeGetType musaGraphNodeGetType
|
||||
#define cudaGraphNode_t musaGraphNode_t
|
||||
#define cudaGraphNodeType musaGraphNodeType
|
||||
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
|
||||
#define cudaGraph_t musaGraph_t
|
||||
#define cudaKernelNodeParams musaKernelNodeParams
|
||||
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
||||
#define cudaStreamEndCapture musaStreamEndCapture
|
||||
|
||||
// XXX: cuBLAS => muBLAS mapping
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||
#define cublasComputeType_t cudaDataType_t
|
||||
|
||||
// XXX: Clang builtins mapping
|
||||
#define __vsub4 __vsub4_musa
|
||||
#define __vcmpeq4 __vcmpeq4_musa
|
||||
#define __vcmpne4 __vcmpne4_musa
|
||||
#include "vendors/musa.h"
|
||||
#else
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#if CUDART_VERSION < 11020
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||
#define cublasComputeType_t cudaDataType_t
|
||||
#endif // CUDART_VERSION < 11020
|
||||
|
||||
#include "vendors/cuda.h"
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
|
||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||
|
@ -318,11 +74,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
|||
|
||||
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
|
||||
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||
#ifndef GGML_USE_MUSA
|
||||
return cublasGetStatusString(err);
|
||||
#else
|
||||
return mublasStatus_to_string(err);
|
||||
#endif // GGML_USE_MUSA
|
||||
}
|
||||
#else
|
||||
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||
|
@ -364,129 +116,7 @@ typedef half2 dfloat2;
|
|||
#else
|
||||
typedef float dfloat; // dequantize float
|
||||
typedef float2 dfloat2;
|
||||
#endif //GGML_CUDA_F16
|
||||
|
||||
#if defined(GGML_USE_MUSA)
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
|
||||
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
#endif // defined(GGML_USE_MUSA)
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
||||
defined(__gfx1150__) || defined(__gfx1151__)
|
||||
#define RDNA3
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
||||
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
||||
#define RDNA2
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1010__) || defined(__gfx1012__)
|
||||
#define RDNA1
|
||||
#endif
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
#if __has_builtin(__builtin_elementwise_sub_sat)
|
||||
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
||||
return reinterpret_cast<const int &>(c);
|
||||
#else
|
||||
int8x4_t c;
|
||||
int16_t tmp;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
tmp = va[i] - vb[i];
|
||||
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
|
||||
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
|
||||
c[i] = tmp;
|
||||
}
|
||||
return reinterpret_cast<int &>(c);
|
||||
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||
// __shfl_xor() for half2 was added in ROCm 5.6
|
||||
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
||||
typedef union half2_b32 {
|
||||
half2 val;
|
||||
int b32;
|
||||
} half2_b32_t;
|
||||
half2_b32_t tmp;
|
||||
tmp.val = var;
|
||||
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
||||
return tmp.val;
|
||||
}
|
||||
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
#endif // GGML_CUDA_F16
|
||||
|
||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||
#define FP16_AVAILABLE
|
||||
|
|
|
@ -500,7 +500,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
|
@ -510,7 +510,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
|
|||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
|
@ -519,7 +519,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
|
|||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
|
@ -528,7 +528,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
|
|||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
|
@ -537,7 +537,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
|
|||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
|
@ -588,7 +588,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
|||
}
|
||||
|
||||
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
|
@ -672,3 +672,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
|
|||
GGML_UNUSED(src1_ncols);
|
||||
GGML_UNUSED(src1_padded_row_size);
|
||||
}
|
||||
|
||||
bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
|
||||
return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
|
||||
src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
|
||||
src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
|
||||
src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
|
||||
src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
|
||||
src0_type == GGML_TYPE_F16;
|
||||
}
|
||||
|
|
|
@ -16,3 +16,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
|
|||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, cudaStream_t stream);
|
||||
|
||||
bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
|
||||
|
|
14
ggml/src/ggml-cuda/vendors/cuda.h
vendored
Normal file
14
ggml/src/ggml-cuda/vendors/cuda.h
vendored
Normal file
|
@ -0,0 +1,14 @@
|
|||
#pragma once
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#if CUDART_VERSION < 11020
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||
#define cublasComputeType_t cudaDataType_t
|
||||
#endif // CUDART_VERSION < 11020
|
177
ggml/src/ggml-cuda/vendors/hip.h
vendored
Normal file
177
ggml/src/ggml-cuda/vendors/hip.h
vendored
Normal file
|
@ -0,0 +1,177 @@
|
|||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// for rocblas_initialize()
|
||||
#include "rocblas/rocblas.h"
|
||||
#endif // __HIP_PLATFORM_AMD__
|
||||
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
||||
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_OP_N HIPBLAS_OP_N
|
||||
#define CUBLAS_OP_T HIPBLAS_OP_T
|
||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||
#define CUDA_R_16F HIPBLAS_R_16F
|
||||
#define CUDA_R_32F HIPBLAS_R_32F
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasDestroy hipblasDestroy
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
||||
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
||||
#define cublasHandle_t hipblasHandle_t
|
||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetStream hipblasSetStream
|
||||
#define cublasSgemm hipblasSgemm
|
||||
#define cublasStatus_t hipblasStatus_t
|
||||
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
||||
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
||||
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
||||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
#define cudaEventDisableTiming hipEventDisableTiming
|
||||
#define cudaEventRecord hipEventRecord
|
||||
#define cudaEventSynchronize hipEventSynchronize
|
||||
#define cudaEvent_t hipEvent_t
|
||||
#define cudaEventDestroy hipEventDestroy
|
||||
#define cudaFree hipFree
|
||||
#define cudaFreeHost hipHostFree
|
||||
#define cudaGetDevice hipGetDevice
|
||||
#define cudaGetDeviceCount hipGetDeviceCount
|
||||
#define cudaGetDeviceProperties hipGetDeviceProperties
|
||||
#define cudaGetErrorString hipGetErrorString
|
||||
#define cudaGetLastError hipGetLastError
|
||||
#define cudaHostRegister hipHostRegister
|
||||
#define cudaHostRegisterPortable hipHostRegisterPortable
|
||||
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
||||
#define cudaHostUnregister hipHostUnregister
|
||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||
#define cudaMalloc hipMalloc
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpyAsync hipMemcpyAsync
|
||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
||||
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||
#define cudaMemcpyKind hipMemcpyKind
|
||||
#define cudaMemset hipMemset
|
||||
#define cudaMemsetAsync hipMemsetAsync
|
||||
#define cudaMemGetInfo hipMemGetInfo
|
||||
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||
#define cudaSetDevice hipSetDevice
|
||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||
#define cudaStreamDestroy hipStreamDestroy
|
||||
#define cudaStreamFireAndForget hipStreamFireAndForget
|
||||
#define cudaStreamNonBlocking hipStreamNonBlocking
|
||||
#define cudaStreamPerThread hipStreamPerThread
|
||||
#define cudaStreamSynchronize hipStreamSynchronize
|
||||
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaSuccess hipSuccess
|
||||
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
||||
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
||||
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
|
||||
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
|
||||
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
|
||||
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
||||
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
||||
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
||||
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
||||
defined(__gfx1150__) || defined(__gfx1151__)
|
||||
#define RDNA3
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
||||
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
||||
#define RDNA2
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1010__) || defined(__gfx1012__)
|
||||
#define RDNA1
|
||||
#endif
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
#if __has_builtin(__builtin_elementwise_sub_sat)
|
||||
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
||||
return reinterpret_cast<const int &>(c);
|
||||
#else
|
||||
int8x4_t c;
|
||||
int16_t tmp;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
tmp = va[i] - vb[i];
|
||||
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
|
||||
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
|
||||
c[i] = tmp;
|
||||
}
|
||||
return reinterpret_cast<int &>(c);
|
||||
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||
// __shfl_xor() for half2 was added in ROCm 5.6
|
||||
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
||||
typedef union half2_b32 {
|
||||
half2 val;
|
||||
int b32;
|
||||
} half2_b32_t;
|
||||
half2_b32_t tmp;
|
||||
tmp.val = var;
|
||||
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
||||
return tmp.val;
|
||||
}
|
||||
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
171
ggml/src/ggml-cuda/vendors/musa.h
vendored
Normal file
171
ggml/src/ggml-cuda/vendors/musa.h
vendored
Normal file
|
@ -0,0 +1,171 @@
|
|||
#pragma once
|
||||
|
||||
#include <musa_runtime.h>
|
||||
#include <musa.h>
|
||||
#include <mublas.h>
|
||||
#include <musa_fp16.h>
|
||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
|
||||
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_OP_N MUBLAS_OP_N
|
||||
#define CUBLAS_OP_T MUBLAS_OP_T
|
||||
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
||||
#define CUDA_R_16F MUSA_R_16F
|
||||
#define CUDA_R_32F MUSA_R_32F
|
||||
#define cublasComputeType_t cudaDataType_t
|
||||
#define cublasCreate mublasCreate
|
||||
#define cublasDestroy mublasDestroy
|
||||
#define cublasGemmEx mublasGemmEx
|
||||
#define cublasGemmBatchedEx mublasGemmBatchedEx
|
||||
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
|
||||
#define cublasHandle_t mublasHandle_t
|
||||
#define cublasSetMathMode mublasSetMathMode
|
||||
#define cublasSetStream mublasSetStream
|
||||
#define cublasSgemm mublasSgemm
|
||||
#define cublasStatus_t mublasStatus_t
|
||||
#define cublasGetStatusString mublasStatus_to_string
|
||||
#define cudaDataType_t musaDataType_t
|
||||
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
|
||||
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
|
||||
#define cudaDeviceProp musaDeviceProp
|
||||
#define cudaDeviceSynchronize musaDeviceSynchronize
|
||||
#define cudaError_t musaError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||
#define cudaEventDisableTiming musaEventDisableTiming
|
||||
#define cudaEventRecord musaEventRecord
|
||||
#define cudaEventSynchronize musaEventSynchronize
|
||||
#define cudaEvent_t musaEvent_t
|
||||
#define cudaEventDestroy musaEventDestroy
|
||||
#define cudaFree musaFree
|
||||
#define cudaFreeHost musaFreeHost
|
||||
#define cudaGetDevice musaGetDevice
|
||||
#define cudaGetDeviceCount musaGetDeviceCount
|
||||
#define cudaGetDeviceProperties musaGetDeviceProperties
|
||||
#define cudaGetErrorString musaGetErrorString
|
||||
#define cudaGetLastError musaGetLastError
|
||||
#define cudaHostRegister musaHostRegister
|
||||
#define cudaHostRegisterPortable musaHostRegisterPortable
|
||||
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
|
||||
#define cudaHostUnregister musaHostUnregister
|
||||
#define cudaLaunchHostFunc musaLaunchHostFunc
|
||||
#define cudaMalloc musaMalloc
|
||||
#define cudaMallocHost musaMallocHost
|
||||
#define cudaMemcpy musaMemcpy
|
||||
#define cudaMemcpyAsync musaMemcpyAsync
|
||||
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
|
||||
#define cudaMemcpy2DAsync musaMemcpy2DAsync
|
||||
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
|
||||
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
|
||||
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
|
||||
#define cudaMemcpyKind musaMemcpyKind
|
||||
#define cudaMemset musaMemset
|
||||
#define cudaMemsetAsync musaMemsetAsync
|
||||
#define cudaMemGetInfo musaMemGetInfo
|
||||
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
|
||||
#define cudaSetDevice musaSetDevice
|
||||
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
|
||||
#define cudaStreamDestroy musaStreamDestroy
|
||||
#define cudaStreamFireAndForget musaStreamFireAndForget
|
||||
#define cudaStreamNonBlocking musaStreamNonBlocking
|
||||
#define cudaStreamPerThread musaStreamPerThread
|
||||
#define cudaStreamSynchronize musaStreamSynchronize
|
||||
#define cudaStreamWaitEvent musaStreamWaitEvent
|
||||
#define cudaStream_t musaStream_t
|
||||
#define cudaSuccess musaSuccess
|
||||
|
||||
// Additional mappings for MUSA virtual memory pool
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
|
||||
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
|
||||
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
|
||||
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
|
||||
#define CUdevice MUdevice
|
||||
#define CUdeviceptr MUdeviceptr
|
||||
#define CUmemAccessDesc MUmemAccessDesc
|
||||
#define CUmemAllocationProp MUmemAllocationProp
|
||||
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
|
||||
#define cuDeviceGet muDeviceGet
|
||||
#define cuDeviceGetAttribute muDeviceGetAttribute
|
||||
#define cuMemAddressFree muMemAddressFree
|
||||
#define cuMemAddressReserve muMemAddressReserve
|
||||
#define cuMemCreate muMemCreate
|
||||
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
|
||||
#define cuMemMap muMemMap
|
||||
#define cuMemRelease muMemRelease
|
||||
#define cuMemSetAccess muMemSetAccess
|
||||
#define cuMemUnmap muMemUnmap
|
||||
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
|
||||
#define cudaFuncSetAttribute musaFuncSetAttribute
|
||||
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
|
||||
#define make_cudaExtent make_musaExtent
|
||||
#define make_cudaPitchedPtr make_musaPitchedPtr
|
||||
|
||||
// Additional mappings for MUSA graphs
|
||||
#define CUDA_SUCCESS MUSA_SUCCESS
|
||||
#define CUresult MUresult
|
||||
#define cuGetErrorString muGetErrorString
|
||||
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
|
||||
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
|
||||
#define cudaGraphDestroy musaGraphDestroy
|
||||
#define cudaGraphExecDestroy musaGraphExecDestroy
|
||||
#define cudaGraphExec_t musaGraphExec_t
|
||||
#define cudaGraphExecUpdate musaGraphExecUpdate
|
||||
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
|
||||
#define cudaGraphGetNodes musaGraphGetNodes
|
||||
#define cudaGraphInstantiate musaGraphInstantiate
|
||||
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
|
||||
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
|
||||
#define cudaGraphLaunch musaGraphLaunch
|
||||
#define cudaGraphNodeGetType musaGraphNodeGetType
|
||||
#define cudaGraphNode_t musaGraphNode_t
|
||||
#define cudaGraphNodeType musaGraphNodeType
|
||||
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
|
||||
#define cudaGraph_t musaGraph_t
|
||||
#define cudaKernelNodeParams musaKernelNodeParams
|
||||
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
||||
#define cudaStreamEndCapture musaStreamEndCapture
|
||||
|
||||
// XXX: Clang builtins mapping
|
||||
#define __vsub4 __vsub4_musa
|
||||
#define __vcmpeq4 __vcmpeq4_musa
|
||||
#define __vcmpne4 __vcmpne4_musa
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
|
||||
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
|
@ -80,8 +80,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
|||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
|
||||
* Subnormals shall be flushed to zero, and NANs will be quiet.
|
||||
* This is binary identical with Google Brain float conversion.
|
||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
|
@ -95,10 +96,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
|||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
if (!(u.i & 0x7f800000)) { /* subnormal */
|
||||
h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
@ -146,6 +143,7 @@ extern "C" {
|
|||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#include <sys/prctl.h>
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
|
|
|
@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
float sumf = 0;
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
if (svcntb() == QK8_0) {
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
||||
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
||||
|
||||
|
@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
float sumf = 0;
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
if (svcntb() == QK8_0) {
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
|
@ -6449,22 +6449,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||
// compute mask for subtraction
|
||||
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
|
||||
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
|
||||
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
|
||||
m <<= 1;
|
||||
|
||||
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
|
||||
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
|
||||
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
|
||||
m <<= 1;
|
||||
|
||||
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
|
||||
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
|
||||
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
|
||||
m <<= 1;
|
||||
|
||||
vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
|
||||
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
|
||||
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
|
||||
m <<= 1;
|
||||
|
||||
// load Q8 and take product with Q3
|
||||
|
@ -7720,13 +7720,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|||
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
|
||||
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
|
||||
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
|
||||
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
|
||||
m <<= 1;
|
||||
|
||||
vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
|
||||
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
|
||||
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
|
||||
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
|
||||
m <<= 1;
|
||||
|
||||
vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
|
||||
|
|
|
@ -127,6 +127,10 @@ void iq2xs_free_impl(enum ggml_type type);
|
|||
void iq3xs_init_impl(int grid_size);
|
||||
void iq3xs_free_impl(int grid_size);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
extern int ggml_sve_cnt_b;
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -3981,6 +3981,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
|||
ggml_sycl_func_t func;
|
||||
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
func = ggml_sycl_op_conv_transpose_1d;
|
||||
break;
|
||||
case GGML_OP_REPEAT:
|
||||
func = ggml_sycl_repeat;
|
||||
break;
|
||||
|
@ -4105,6 +4108,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
|||
case GGML_OP_ARGSORT:
|
||||
func = ggml_sycl_argsort;
|
||||
break;
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
func = ggml_sycl_op_timestep_embedding;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
@ -5090,6 +5096,15 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
|
|||
|
||||
GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
ggml_type src1_type = op->src[1]->type;
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
|
@ -5213,6 +5228,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
#include "concat.hpp"
|
||||
#include "common.hpp"
|
||||
#include "conv.hpp"
|
||||
#include "convert.hpp"
|
||||
#include "dequantize.hpp"
|
||||
#include "dmmv.hpp"
|
||||
|
@ -23,5 +24,6 @@
|
|||
#include "rope.hpp"
|
||||
#include "norm.hpp"
|
||||
#include "softmax.hpp"
|
||||
#include "tsembd.hpp"
|
||||
|
||||
#endif // GGML_SYCL_BACKEND_HPP
|
||||
|
|
99
ggml/src/ggml-sycl/conv.cpp
Normal file
99
ggml/src/ggml-sycl/conv.cpp
Normal file
|
@ -0,0 +1,99 @@
|
|||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#include "conv.hpp"
|
||||
|
||||
static void conv_transpose_1d_kernel(
|
||||
const int s0, const int output_size,
|
||||
const int src0_ne0, const int src0_ne1, const int src0_ne2,
|
||||
const int src1_ne0, const int dst_ne0,
|
||||
const float * src0, const float * src1, float * dst,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int global_index = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
if (global_index >= output_size) {
|
||||
return;
|
||||
}
|
||||
|
||||
int out_index = global_index / dst_ne0;
|
||||
|
||||
float accumulator = 0;
|
||||
|
||||
for (int c = 0; c < src0_ne2; c++) {
|
||||
int idx = global_index % dst_ne0;
|
||||
|
||||
int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
|
||||
int input_offset = src1_ne0 * c;
|
||||
|
||||
for (int i = 0; i < src1_ne0; i++) {
|
||||
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
|
||||
continue;
|
||||
}
|
||||
int weight_idx = idx - i*s0;
|
||||
|
||||
float kernel_weight = src0[kernel_offset + weight_idx];
|
||||
float input_value = src1[input_offset+i];
|
||||
|
||||
accumulator += kernel_weight * input_value;
|
||||
}
|
||||
}
|
||||
dst[global_index] = accumulator;
|
||||
}
|
||||
|
||||
static void conv_transpose_1d_f32_f32_sycl(
|
||||
const int s0, const int output_size,
|
||||
const int src0_ne0, const int src0_ne1, const int src0_ne2,
|
||||
const int src1_ne0, const int dst_ne0,
|
||||
const float *src0, const float *src1, float *dst,
|
||||
const queue_ptr& stream) {
|
||||
|
||||
const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
|
||||
const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
|
||||
const sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
conv_transpose_1d_kernel(
|
||||
s0, output_size,
|
||||
src0_ne0, src0_ne1, src0_ne2,
|
||||
src1_ne0, dst_ne0,
|
||||
src0, src1, dst, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst) {
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
const int32_t * opts = (const int32_t *)dst->op_params;
|
||||
|
||||
const int s0 = opts[0];
|
||||
|
||||
const int64_t output_size = ggml_nelements(dst);
|
||||
|
||||
conv_transpose_1d_f32_f32_sycl(s0, output_size,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2],
|
||||
src1->ne[0], dst->ne[0],
|
||||
src0_d, src1_d, dst_d, stream);
|
||||
}
|
||||
|
21
ggml/src/ggml-sycl/conv.hpp
Normal file
21
ggml/src/ggml-sycl/conv.hpp
Normal file
|
@ -0,0 +1,21 @@
|
|||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_CONV_HPP
|
||||
#define GGML_SYCL_CONV_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst);
|
||||
|
||||
#endif // GGML_SYCL_CONV_HPP
|
|
@ -902,7 +902,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
|
|||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
|
||||
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
|
|
|
@ -41,6 +41,8 @@
|
|||
#define SYCL_ACC_BLOCK_SIZE 256
|
||||
#define SYCL_IM2COL_BLOCK_SIZE 256
|
||||
#define SYCL_POOL2D_BLOCK_SIZE 256
|
||||
#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
|
||||
#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
|
||||
|
||||
// dmmv = dequantize_mul_mat_vec
|
||||
#ifndef GGML_SYCL_DMMV_X
|
||||
|
|
71
ggml/src/ggml-sycl/tsembd.cpp
Normal file
71
ggml/src/ggml-sycl/tsembd.cpp
Normal file
|
@ -0,0 +1,71 @@
|
|||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#include "tsembd.hpp"
|
||||
|
||||
static void timestep_embedding_f32(
|
||||
const float * timesteps, float * dst, const int nb1,
|
||||
const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) {
|
||||
// item_ct1.get_group(1)(blockIDx.y): idx of timesteps->ne[0]
|
||||
// item_ct1.get_group(2) (blockIDx.x): idx of ((dim + 1) / 2) / BLOCK_SIZE
|
||||
int i = item_ct1.get_group(1);
|
||||
int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
float * embed_data = (float *)((char *)dst + i*nb1);
|
||||
|
||||
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
|
||||
embed_data[dim] = 0.f;
|
||||
}
|
||||
|
||||
int half = dim / 2;
|
||||
if (j >= half) {
|
||||
return;
|
||||
}
|
||||
|
||||
float timestep = timesteps[i];
|
||||
float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half);
|
||||
float arg = timestep * freq;
|
||||
embed_data[j] = sycl::cos(arg);
|
||||
embed_data[j + half] = sycl::sin(arg);
|
||||
}
|
||||
|
||||
static void timestep_embedding_f32_sycl(
|
||||
const float * x, float * dst, const int ne00, const int nb1,
|
||||
const int dim, const int max_period, const queue_ptr& stream) {
|
||||
// As the kernel returns when thread.idx is larger than dim/2, the half_ceil does not need to pad
|
||||
int half_ceil = dim / 2;
|
||||
int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
|
||||
sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
|
||||
sycl::range<3> gridDim(1, ne00, num_blocks);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
gridDim * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
timestep_embedding_f32(
|
||||
x, dst, nb1, dim, max_period, item_ct1
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor * dst) {
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int dim = dst->op_params[0];
|
||||
const int max_period = dst->op_params[1];
|
||||
|
||||
timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
|
||||
}
|
21
ggml/src/ggml-sycl/tsembd.hpp
Normal file
21
ggml/src/ggml-sycl/tsembd.hpp
Normal file
|
@ -0,0 +1,21 @@
|
|||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_TSEMBD_HPP
|
||||
#define GGML_SYCL_TSEMBD_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_TSEMBD_HPP
|
|
@ -37,6 +37,9 @@
|
|||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
int ggml_sve_cnt_b = 0;
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#undef GGML_USE_LLAMAFILE
|
||||
#endif
|
||||
|
@ -141,7 +144,51 @@ typedef pthread_t ggml_thread_t;
|
|||
|
||||
#include <sys/wait.h>
|
||||
|
||||
#if defined(__linux__)
|
||||
#if defined(__ANDROID__)
|
||||
#include <unwind.h>
|
||||
#include <dlfcn.h>
|
||||
#include <stdio.h>
|
||||
|
||||
struct backtrace_state {
|
||||
void ** current;
|
||||
void ** end;
|
||||
};
|
||||
|
||||
static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
|
||||
struct backtrace_state * state = (struct backtrace_state *)arg;
|
||||
uintptr_t pc = _Unwind_GetIP(context);
|
||||
if (pc) {
|
||||
if (state->current == state->end) {
|
||||
return _URC_END_OF_STACK;
|
||||
} else {
|
||||
*state->current++ = (void*)pc;
|
||||
}
|
||||
}
|
||||
return _URC_NO_REASON;
|
||||
}
|
||||
|
||||
static void ggml_print_backtrace_symbols(void) {
|
||||
const int max = 100;
|
||||
void* buffer[max];
|
||||
|
||||
struct backtrace_state state = {buffer, buffer + max};
|
||||
_Unwind_Backtrace(unwind_callback, &state);
|
||||
|
||||
int count = state.current - buffer;
|
||||
|
||||
for (int idx = 0; idx < count; ++idx) {
|
||||
const void * addr = buffer[idx];
|
||||
const char * symbol = "";
|
||||
|
||||
Dl_info info;
|
||||
if (dladdr(addr, &info) && info.dli_sname) {
|
||||
symbol = info.dli_sname;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
|
||||
}
|
||||
}
|
||||
#elif defined(__linux__) && defined(__GLIBC__)
|
||||
#include <execinfo.h>
|
||||
static void ggml_print_backtrace_symbols(void) {
|
||||
void * trace[100];
|
||||
|
@ -436,9 +483,16 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
y[i] = ggml_compute_fp32_to_bf16(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
|
||||
int i = 0;
|
||||
#if defined(__AVX512BF16__)
|
||||
// subnormals are flushed to zero on this platform
|
||||
for (; i + 32 <= n; i += 32) {
|
||||
_mm512_storeu_si512(
|
||||
(__m512i *)(y + i),
|
||||
|
@ -918,7 +972,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = false,
|
||||
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
||||
.vec_dot_type = GGML_TYPE_BF16,
|
||||
.nrows = 1,
|
||||
|
@ -3507,6 +3561,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||
|
||||
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
if (!ggml_sve_cnt_b) {
|
||||
ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
|
||||
}
|
||||
#endif
|
||||
|
||||
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
||||
|
||||
ggml_critical_section_end();
|
||||
|
@ -20513,7 +20573,7 @@ size_t ggml_quantize_chunk(
|
|||
case GGML_TYPE_BF16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_bf16_t);
|
||||
ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n);
|
||||
ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
|
||||
result = n * elemsize;
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
|
|
|
@ -30,6 +30,20 @@
|
|||
|
||||
#define ASYNCIO_CONCURRENCY 64
|
||||
|
||||
// define prototypes
|
||||
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str);
|
||||
bool directory_exists(const std::string& path);
|
||||
bool create_directory(const std::string& path);
|
||||
std::string to_uppercase(const std::string& input);
|
||||
bool string_ends_with(const std::string& str, const std::string& suffix);
|
||||
std::string join_paths(const std::string& path1, const std::string& path2);
|
||||
std::string basename(const std::string &path);
|
||||
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16);
|
||||
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b);
|
||||
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id);
|
||||
void process_shaders(std::vector<std::future<void>>& tasks);
|
||||
void write_output_files();
|
||||
|
||||
std::mutex lock;
|
||||
std::vector<std::pair<std::string, std::string>> shader_fnames;
|
||||
|
||||
|
@ -38,7 +52,7 @@ std::string input_dir = "vulkan-shaders";
|
|||
std::string output_dir = "/tmp";
|
||||
std::string target_hpp = "ggml-vulkan-shaders.hpp";
|
||||
std::string target_cpp = "ggml-vulkan-shaders.cpp";
|
||||
bool no_clean = false;
|
||||
bool clean = true;
|
||||
|
||||
const std::vector<std::string> type_names = {
|
||||
"f32",
|
||||
|
@ -464,8 +478,9 @@ void write_output_files() {
|
|||
}
|
||||
fprintf(src, "\n};\n\n");
|
||||
|
||||
if (!no_clean) {
|
||||
if (clean) {
|
||||
std::remove(path.c_str());
|
||||
// fprintf(stderr, "Removed: %s\n", path.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -481,6 +496,18 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
if (argc <= 1 || args.find("--help") != args.end()) {
|
||||
std::cout << "Usage:\n"
|
||||
"\tvulkan-shaders-gen [options]\n\n"
|
||||
"Options:\n"
|
||||
"\t--glslc <path> Path to glslc executable (default: /usr/bin/glslc)\n"
|
||||
"\t--input-dir Directory containing shader sources (required)\n"
|
||||
"\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n"
|
||||
"\t--target-hpp <path> Path to generate a header file with shader declarations in C++ format\n"
|
||||
"\t--target-cpp <path> Path to generate a source code file implementing the declared shaders (optional)\n"
|
||||
"\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n";
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
if (args.find("--glslc") != args.end()) {
|
||||
GLSLC = args["--glslc"]; // Path to glslc
|
||||
}
|
||||
|
@ -497,7 +524,7 @@ int main(int argc, char** argv) {
|
|||
target_cpp = args["--target-cpp"]; // Path to generated cpp file
|
||||
}
|
||||
if (args.find("--no-clean") != args.end()) {
|
||||
no_clean = true; // Keep temporary SPIR-V files in output-dir after build
|
||||
clean = false; // Keep temporary SPIR-V files in output-dir after build
|
||||
}
|
||||
|
||||
if (!directory_exists(input_dir)) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue