Improve inference performance for Ascend NPU.
Co-authored-by: Frank Mai <thxCode@thxcode0824@gmail.com>
This commit is contained in:
parent
3952a221af
commit
f0e09002c3
3 changed files with 413 additions and 72 deletions
|
@ -32,6 +32,8 @@
|
||||||
#include <aclnnop/aclnn_group_norm.h>
|
#include <aclnnop/aclnn_group_norm.h>
|
||||||
#include <aclnnop/aclnn_index_fill_tensor.h>
|
#include <aclnnop/aclnn_index_fill_tensor.h>
|
||||||
#include <aclnnop/aclnn_layer_norm.h>
|
#include <aclnnop/aclnn_layer_norm.h>
|
||||||
|
#include <aclnnop/aclnn_mm.h>
|
||||||
|
#include <aclnnop/aclnn_batch_matmul.h>
|
||||||
#include <aclnnop/aclnn_matmul.h>
|
#include <aclnnop/aclnn_matmul.h>
|
||||||
#include <aclnnop/aclnn_max_pool.h>
|
#include <aclnnop/aclnn_max_pool.h>
|
||||||
#include <aclnnop/aclnn_permute.h>
|
#include <aclnnop/aclnn_permute.h>
|
||||||
|
@ -2407,7 +2409,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||||
aclTensor* acl_weight, aclTensor* acl_dst) {
|
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||||
int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is
|
int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is
|
||||||
// fp32, atlas a2 will transpose it to HFLOAT32.
|
// fp32, atlas a2 will transpose it to HFLOAT32.
|
||||||
|
|
||||||
uint64_t workspaceSize = 0;
|
uint64_t workspaceSize = 0;
|
||||||
aclOpExecutor* executor;
|
aclOpExecutor* executor;
|
||||||
void* workspaceAddr = nullptr;
|
void* workspaceAddr = nullptr;
|
||||||
|
@ -2425,6 +2426,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||||
aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Performs matrix multiplication of two 2D tensors.
|
||||||
|
*
|
||||||
|
* This function computes the matrix multiplication of the input tensor
|
||||||
|
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
|
||||||
|
* destination tensor `acl_dst`.
|
||||||
|
* The operation is defined as:
|
||||||
|
* \f[
|
||||||
|
* \text {acl_dst}=\text {acl_input@acl_weight}
|
||||||
|
* \f]
|
||||||
|
*
|
||||||
|
* @param ctx The context for the CANN backend operations.
|
||||||
|
* @param acl_input The input tensor for the matrix multiplication.
|
||||||
|
* @param acl_weight The weight tensor for the matrix multiplication.
|
||||||
|
* @param acl_dst The destination tensor where the result of the matrix
|
||||||
|
* multiplication will be stored.
|
||||||
|
*/
|
||||||
|
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||||
|
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||||
|
int8_t cube_math_type = 2;
|
||||||
|
uint64_t workspaceSize = 0;
|
||||||
|
aclOpExecutor* executor;
|
||||||
|
void* workspaceAddr = nullptr;
|
||||||
|
|
||||||
|
ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
|
||||||
|
cube_math_type, &workspaceSize,
|
||||||
|
&executor));
|
||||||
|
|
||||||
|
if (workspaceSize > 0) {
|
||||||
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||||
|
workspaceAddr = workspace_allocator.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
ACL_CHECK(
|
||||||
|
aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Performs matrix multiplication of two 3D tensors.
|
||||||
|
*
|
||||||
|
* This function computes the matrix multiplication of the input tensor
|
||||||
|
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
|
||||||
|
* destination tensor `acl_dst`.
|
||||||
|
* The operation is defined as:
|
||||||
|
* \f[
|
||||||
|
* \text {acl_dst}=\text {acl_input@acl_weight}
|
||||||
|
* \f]
|
||||||
|
*
|
||||||
|
* @param ctx The context for the CANN backend operations.
|
||||||
|
* @param acl_input The input tensor for the matrix multiplication.
|
||||||
|
* @param acl_weight The weight tensor for the matrix multiplication.
|
||||||
|
* @param acl_dst The destination tensor where the result of the matrix
|
||||||
|
* multiplication will be stored.
|
||||||
|
*/
|
||||||
|
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||||
|
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||||
|
int8_t cube_math_type = 2;
|
||||||
|
uint64_t workspaceSize = 0;
|
||||||
|
aclOpExecutor* executor;
|
||||||
|
void* workspaceAddr = nullptr;
|
||||||
|
|
||||||
|
ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
|
||||||
|
cube_math_type, &workspaceSize,
|
||||||
|
&executor));
|
||||||
|
|
||||||
|
if (workspaceSize > 0) {
|
||||||
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||||
|
workspaceAddr = workspace_allocator.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
ACL_CHECK(
|
||||||
|
aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Performs matrix multiplication with floating-point precision on
|
* @brief Performs matrix multiplication with floating-point precision on
|
||||||
* tensors using the CANN backend.
|
* tensors using the CANN backend.
|
||||||
|
@ -2466,6 +2541,70 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
||||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Performs matrix multiplication with floating-point precision on
|
||||||
|
* tensors using the CANN backend.
|
||||||
|
*
|
||||||
|
* This function performs matrix multiplication of the input tensor and the
|
||||||
|
* weight tensor, handling broadcasting and transposing as needed, and stores
|
||||||
|
* the result in the destination tensor `dst`.
|
||||||
|
*
|
||||||
|
* @param ctx The context for the CANN backend operations.
|
||||||
|
* @param dst The destination tensor where the result of the matrix
|
||||||
|
* multiplication will be stored.
|
||||||
|
*/
|
||||||
|
static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx,
|
||||||
|
ggml_tensor* dst) {
|
||||||
|
ggml_tensor* weight = dst->src[0]; // weight
|
||||||
|
ggml_tensor* input = dst->src[1]; // input
|
||||||
|
|
||||||
|
// when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto
|
||||||
|
// broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
|
||||||
|
BCAST_MUL_MAT_SHAPE(input, weight, dst);
|
||||||
|
|
||||||
|
int64_t n_dims = bcast_dims;
|
||||||
|
if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
|
||||||
|
if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
|
||||||
|
n_dims = 2;
|
||||||
|
} else if (bcast_input_ne[2] == 1) {
|
||||||
|
n_dims = 3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
aclTensor* acl_input_tensor =
|
||||||
|
ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
|
||||||
|
int64_t transpose_ne[] = {
|
||||||
|
bcast_weight_ne[1], bcast_weight_ne[0],
|
||||||
|
bcast_weight_ne[2], bcast_weight_ne[3],
|
||||||
|
bcast_weight_ne[4], bcast_weight_ne[5]
|
||||||
|
};
|
||||||
|
size_t transpose_nb[] = {
|
||||||
|
bcast_weight_nb[1], bcast_weight_nb[0],
|
||||||
|
bcast_weight_nb[2], bcast_weight_nb[3],
|
||||||
|
bcast_weight_nb[4], bcast_weight_nb[5]
|
||||||
|
};
|
||||||
|
aclTensor* acl_weight_tensor =
|
||||||
|
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
|
||||||
|
aclTensor* acl_dst =
|
||||||
|
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
||||||
|
|
||||||
|
switch (n_dims) {
|
||||||
|
case 2:
|
||||||
|
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Performs matrix multiplication with quantized weights and
|
* @brief Performs matrix multiplication with quantized weights and
|
||||||
* floating-point inputs using the CANN backend.
|
* floating-point inputs using the CANN backend.
|
||||||
|
@ -2618,16 +2757,215 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||||
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
|
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Performs matrix multiplication with quantized weights and
|
||||||
|
* floating-point inputs using the CANN backend.
|
||||||
|
*
|
||||||
|
* This function performs matrix multiplication of the input tensor `src1` and
|
||||||
|
* the weight tensor `src0`, handling broadcasting, transposing, and
|
||||||
|
* quantization as needed, and stores the result in the destination tensor
|
||||||
|
* `dst`.
|
||||||
|
*
|
||||||
|
* @param ctx The context for the CANN backend operations.
|
||||||
|
* @param dst The destination tensor where the result of the matrix
|
||||||
|
* multiplication will be stored.
|
||||||
|
*/
|
||||||
|
static void ggml_cann_mul_mat_quant2(ggml_backend_cann_context& ctx,
|
||||||
|
ggml_tensor* dst,
|
||||||
|
const enum ggml_type type) {
|
||||||
|
ggml_tensor* src0 = dst->src[0]; // weight
|
||||||
|
ggml_tensor* src1 = dst->src[1]; // input
|
||||||
|
|
||||||
|
// The shape of the weight is NCHW.
|
||||||
|
// Matrix multiplication uses HW dims.
|
||||||
|
// HC is regarded as batch.
|
||||||
|
// weight need transpose.
|
||||||
|
float weight_elem_size;
|
||||||
|
if (type == GGML_TYPE_Q4_0) {
|
||||||
|
weight_elem_size = float(sizeof(uint8_t)) / 2;
|
||||||
|
} else if (type == GGML_TYPE_Q8_0) {
|
||||||
|
weight_elem_size = float(sizeof(uint8_t));
|
||||||
|
} else {
|
||||||
|
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
|
||||||
|
}
|
||||||
|
float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
|
||||||
|
size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
|
||||||
|
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
|
||||||
|
|
||||||
|
// scale stored at the end of weight.
|
||||||
|
// scale need transpose.
|
||||||
|
size_t scale_elem_size = sizeof(uint16_t);
|
||||||
|
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
|
||||||
|
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
|
||||||
|
char* scale_offset = (char*)src0->data + weight_size;
|
||||||
|
|
||||||
|
// input
|
||||||
|
size_t input_elem_size = sizeof(uint16_t);
|
||||||
|
int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
|
||||||
|
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
|
||||||
|
size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
|
||||||
|
ggml_cann_pool_alloc input_alloctor(ctx.pool());
|
||||||
|
void* input_buffer = src1->data;
|
||||||
|
|
||||||
|
// case in
|
||||||
|
if (src1->type != GGML_TYPE_F16) {
|
||||||
|
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
|
||||||
|
input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
||||||
|
|
||||||
|
int64_t* input_cast_ne = src1->ne;
|
||||||
|
size_t input_cast_nb[GGML_MAX_DIMS];
|
||||||
|
input_cast_nb[0] = sizeof(uint16_t);
|
||||||
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||||
|
input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||||
|
input_buffer,
|
||||||
|
ACL_FLOAT16,
|
||||||
|
input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
|
||||||
|
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
|
||||||
|
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
|
||||||
|
}
|
||||||
|
|
||||||
|
// output
|
||||||
|
size_t output_elem_size = sizeof(uint16_t);
|
||||||
|
size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
|
||||||
|
ggml_cann_pool_alloc output_allocator(ctx.pool());
|
||||||
|
void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
|
||||||
|
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
|
||||||
|
|
||||||
|
// aclnn
|
||||||
|
int64_t max_elem_size = 65535;
|
||||||
|
int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
|
||||||
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
|
||||||
|
aclOpExecutor* executor = nullptr;
|
||||||
|
uint64_t workspaceSize = 0;
|
||||||
|
void* workspaceAddr = nullptr;
|
||||||
|
for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
|
||||||
|
for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
|
||||||
|
int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
|
||||||
|
int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
|
||||||
|
|
||||||
|
int64_t batch1 = (n1 * src1->ne[2]) + c1;
|
||||||
|
int64_t batch0 = (n0 * src0->ne[2]) + c0;
|
||||||
|
|
||||||
|
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||||
|
(char*)input_buffer + batch1 * input_stride,
|
||||||
|
ACL_FLOAT16,
|
||||||
|
input_elem_size, input_ne, input_nb, 2);
|
||||||
|
|
||||||
|
// first split
|
||||||
|
int64_t weight_ne_offset = 0;
|
||||||
|
int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
|
||||||
|
int64_t scale_ne_offset = 0;
|
||||||
|
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
|
||||||
|
int64_t output_ne_offset = 0;
|
||||||
|
int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
|
||||||
|
|
||||||
|
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
|
||||||
|
(char*)src0->data + batch0 * weight_stride,
|
||||||
|
ggml_cann_type_mapping(type),
|
||||||
|
weight_elem_size, weight_ne, weight_nb, 2,
|
||||||
|
ACL_FORMAT_ND, weight_ne_offset);
|
||||||
|
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
|
||||||
|
scale_offset + batch0 * scale_stride,
|
||||||
|
ACL_FLOAT16,
|
||||||
|
scale_elem_size, scale_ne, scale_nb, 2,
|
||||||
|
ACL_FORMAT_ND, scale_ne_offset);
|
||||||
|
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
|
||||||
|
(char*)output_buffer + batch1 * output_stride,
|
||||||
|
ACL_FLOAT16,
|
||||||
|
output_elem_size, output_ne, output_nb, 2,
|
||||||
|
ACL_FORMAT_ND, output_ne_offset);
|
||||||
|
|
||||||
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||||
|
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||||
|
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
||||||
|
acl_output_tensor, &workspaceSize, &executor));
|
||||||
|
if (workspaceAddr == nullptr) {
|
||||||
|
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
||||||
|
}
|
||||||
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
||||||
|
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||||
|
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||||
|
|
||||||
|
// other splits
|
||||||
|
for (int64_t split = 1; split < split_size; split++) {
|
||||||
|
weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
|
||||||
|
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
|
||||||
|
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
|
||||||
|
scale_ne[0] = weight_ne[0];
|
||||||
|
output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
|
||||||
|
output_ne[0] = weight_ne[0];
|
||||||
|
|
||||||
|
acl_weight_tensor = ggml_cann_create_tensor(
|
||||||
|
(char*)src0->data + batch0 * weight_stride,
|
||||||
|
ggml_cann_type_mapping(type),
|
||||||
|
weight_elem_size, weight_ne, weight_nb, 2,
|
||||||
|
ACL_FORMAT_ND, weight_ne_offset);
|
||||||
|
acl_scale_tensor = ggml_cann_create_tensor(
|
||||||
|
scale_offset + batch0 * scale_stride,
|
||||||
|
ACL_FLOAT16,
|
||||||
|
scale_elem_size, scale_ne, scale_nb, 2,
|
||||||
|
ACL_FORMAT_ND, scale_ne_offset);
|
||||||
|
acl_output_tensor = ggml_cann_create_tensor(
|
||||||
|
(char*)output_buffer + batch1 * output_stride,
|
||||||
|
ACL_FLOAT16,
|
||||||
|
output_elem_size, output_ne, output_nb, 2,
|
||||||
|
ACL_FORMAT_ND, output_ne_offset);
|
||||||
|
|
||||||
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||||
|
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||||
|
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
||||||
|
acl_output_tensor, &workspaceSize, &executor));
|
||||||
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
||||||
|
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||||
|
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||||
|
}
|
||||||
|
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cast out
|
||||||
|
if (dst->type != GGML_TYPE_F16) {
|
||||||
|
int64_t* output_cast_ne = dst->ne;
|
||||||
|
size_t output_cast_nb[GGML_MAX_DIMS];
|
||||||
|
output_cast_nb[0] = sizeof(uint16_t);
|
||||||
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||||
|
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
|
||||||
|
output_buffer,
|
||||||
|
ACL_FLOAT16,
|
||||||
|
output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
|
||||||
|
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||||
|
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
|
||||||
|
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||||
|
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||||
const enum ggml_type type = dst->src[0]->type;
|
const enum ggml_type type = dst->src[0]->type;
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
ggml_cann_mat_mul_fp(ctx, dst);
|
ggml_cann_mat_mul_fp2(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
ggml_cann_mul_mat_quant(ctx, dst, type);
|
ggml_cann_mul_mat_quant2(ctx, dst, type);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
|
|
|
@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
|
||||||
struct ggml_backend_cann_context {
|
struct ggml_backend_cann_context {
|
||||||
int32_t device; /**< Device ID. */
|
int32_t device; /**< Device ID. */
|
||||||
std::string name; /**< Name of the device. */
|
std::string name; /**< Name of the device. */
|
||||||
|
std::string description; /**< Description of the device. */
|
||||||
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
||||||
|
|
||||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
|
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
|
||||||
{nullptr}}; /**< Array of streams for the device. */
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Constructor for initializing the context with a given device.
|
* @brief Constructor for initializing the context with a given device.
|
||||||
* @param device Device ID.
|
* @param device Device ID.
|
||||||
*/
|
*/
|
||||||
explicit ggml_backend_cann_context(int device)
|
explicit ggml_backend_cann_context(int device)
|
||||||
: device(device), name("CANN" + std::to_string(device)) {}
|
: device(device), name("CANN" + std::to_string(device)) {
|
||||||
|
ggml_cann_set_device(device);
|
||||||
|
description = aclrtGetSocName();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Destructor for cleaning up resources.
|
* @brief Destructor for cleaning up resources.
|
||||||
|
|
|
@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
|
||||||
ACL_CHECK(aclrtMemGetAllocationGranularity(
|
ACL_CHECK(aclrtMemGetAllocationGranularity(
|
||||||
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
|
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
|
||||||
&info.devices[id].vmm_granularity));
|
&info.devices[id].vmm_granularity));
|
||||||
|
|
||||||
|
size_t free, total;
|
||||||
|
ggml_backend_cann_get_device_memory(id, &free, &total);
|
||||||
|
info.devices[id].total_vram = free;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: add more device info later.
|
// TODO: add more device info later.
|
||||||
|
@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||||
* @return A pointer to the allocated buffer.
|
* @return A pointer to the allocated buffer.
|
||||||
*/
|
*/
|
||||||
void* alloc(size_t size, size_t* actual_size) override {
|
void* alloc(size_t size, size_t* actual_size) override {
|
||||||
|
const size_t alignment = 128;
|
||||||
|
size = GGML_PAD(size, alignment);
|
||||||
|
if (size == 0) {
|
||||||
|
size = alignment;
|
||||||
|
}
|
||||||
#ifdef DEBUG_CANN_MALLOC
|
#ifdef DEBUG_CANN_MALLOC
|
||||||
int nnz = 0;
|
int nnz = 0;
|
||||||
size_t max_size = 0;
|
size_t max_size = 0;
|
||||||
|
@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
void* ptr;
|
void* ptr;
|
||||||
size_t look_ahead_size = (size_t)(1.05 * size);
|
|
||||||
look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
|
|
||||||
ggml_cann_set_device(device);
|
ggml_cann_set_device(device);
|
||||||
ACL_CHECK(
|
ACL_CHECK(
|
||||||
aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||||
*actual_size = look_ahead_size;
|
*actual_size = size;
|
||||||
pool_size += look_ahead_size;
|
pool_size += size;
|
||||||
#ifdef DEBUG_CANN_MALLOC
|
#ifdef DEBUG_CANN_MALLOC
|
||||||
GGML_LOG_INFO(
|
GGML_LOG_INFO(
|
||||||
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
|
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
|
||||||
|
@ -294,9 +301,9 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||||
*/
|
*/
|
||||||
struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||||
/**
|
/**
|
||||||
* @brief The maximum size of the virtual memory pool (32 GB).
|
* @brief The maximum size of the virtual memory pool.
|
||||||
*/
|
*/
|
||||||
static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
size_t max_size;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief The device ID associated with this buffer pool.
|
* @brief The device ID associated with this buffer pool.
|
||||||
|
@ -334,6 +341,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||||
std::vector<void*> map_offsets;
|
std::vector<void*> map_offsets;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* @brief Constructor to initialize the buffer pool with virtual memory for
|
||||||
* @brief Constructor to initialize the buffer pool with virtual memory for
|
* @brief Constructor to initialize the buffer pool with virtual memory for
|
||||||
* a specific device.
|
* a specific device.
|
||||||
*
|
*
|
||||||
|
@ -341,7 +349,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||||
*/
|
*/
|
||||||
explicit ggml_cann_pool_vmm(int device)
|
explicit ggml_cann_pool_vmm(int device)
|
||||||
: device(device),
|
: device(device),
|
||||||
granularity(ggml_cann_info().devices[device].vmm_granularity) {}
|
granularity(ggml_cann_info().devices[device].vmm_granularity) {
|
||||||
|
auto dev = ggml_cann_info().devices[device];
|
||||||
|
granularity = dev.vmm_granularity;
|
||||||
|
max_size = dev.total_vram;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Destructor to free all buffers in the virtual memory pool.
|
* @brief Destructor to free all buffers in the virtual memory pool.
|
||||||
|
@ -370,17 +382,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||||
// round up the allocation size to the alignment to ensure that all
|
// round up the allocation size to the alignment to ensure that all
|
||||||
// allocations are aligned for all data types
|
// allocations are aligned for all data types
|
||||||
const size_t alignment = 128;
|
const size_t alignment = 128;
|
||||||
size = alignment * ((size + alignment - 1) / alignment);
|
size = GGML_PAD(size, alignment);
|
||||||
|
if (size == 0) {
|
||||||
|
size = alignment;
|
||||||
|
}
|
||||||
|
|
||||||
size_t avail = pool_size - pool_used;
|
size_t avail = pool_size - pool_used;
|
||||||
|
|
||||||
if (size > avail) {
|
if (size > avail) {
|
||||||
// round up to the next multiple of the granularity
|
// round up to the next multiple of the granularity
|
||||||
size_t reserve_size = size - avail;
|
size_t reserve_size = size - avail;
|
||||||
reserve_size =
|
reserve_size = GGML_PAD(reserve_size, granularity);
|
||||||
granularity * ((reserve_size + granularity - 1) / granularity);
|
|
||||||
|
|
||||||
GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
|
GGML_ASSERT(pool_size + reserve_size <= max_size);
|
||||||
|
|
||||||
// allocate more physical memory
|
// allocate more physical memory
|
||||||
aclrtPhysicalMemProp prop = {};
|
aclrtPhysicalMemProp prop = {};
|
||||||
|
@ -396,7 +410,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||||
// reserve virtual address space (if not already reserved)
|
// reserve virtual address space (if not already reserved)
|
||||||
if (pool_addr == 0) {
|
if (pool_addr == 0) {
|
||||||
ACL_CHECK(aclrtReserveMemAddress(
|
ACL_CHECK(aclrtReserveMemAddress(
|
||||||
&pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
|
&pool_addr, max_size, 0, NULL, 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
// map at the end of the pool
|
// map at the end of the pool
|
||||||
|
@ -409,10 +423,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||||
// add to the pool
|
// add to the pool
|
||||||
pool_size += reserve_size;
|
pool_size += reserve_size;
|
||||||
|
|
||||||
// GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
|
#ifdef DEBUG_CANN_MALLOC
|
||||||
// reserved %llu MB)\n",
|
GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
||||||
// device, (unsigned long long) (pool_size/1024/1024),
|
device, (unsigned long long) (pool_size/1024/1024),
|
||||||
// (unsigned long long) (reserve_size/1024/1024));
|
(unsigned long long) (reserve_size/1024/1024));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ASSERT(pool_addr != 0);
|
GGML_ASSERT(pool_addr != 0);
|
||||||
|
@ -457,9 +472,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||||
*/
|
*/
|
||||||
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
||||||
int device) {
|
int device) {
|
||||||
// return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
|
if (device == 0) {
|
||||||
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
||||||
}
|
}
|
||||||
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
|
||||||
|
}
|
||||||
|
|
||||||
// cann buffer
|
// cann buffer
|
||||||
/**
|
/**
|
||||||
|
@ -470,23 +487,22 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
||||||
*/
|
*/
|
||||||
struct ggml_backend_cann_buffer_context {
|
struct ggml_backend_cann_buffer_context {
|
||||||
int32_t device; ///< The device ID associated with this buffer context.
|
int32_t device; ///< The device ID associated with this buffer context.
|
||||||
void* dev_ptr =
|
ggml_cann_pool_alloc* alloc; ///< Pointer to the device memory allocated for the buffer.
|
||||||
nullptr; ///< Pointer to the device memory allocated for the buffer.
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Constructor to initialize the CANN buffer context.
|
* @brief Constructor to initialize the CANN buffer context.
|
||||||
*
|
*
|
||||||
* @param device The device ID associated with this buffer context.
|
* @param device The device ID associated with this buffer context.
|
||||||
* @param dev_ptr Pointer to the device memory allocated for the buffer.
|
* @param alloc Pointer to the device memory allocated for the buffer.
|
||||||
*/
|
*/
|
||||||
ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
|
ggml_backend_cann_buffer_context(int32_t device, ggml_cann_pool_alloc* alloc)
|
||||||
: device(device),
|
: device(device),
|
||||||
dev_ptr(dev_ptr) {}
|
alloc(alloc) {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Destructor to free the device memory allocated for the buffer.
|
* @brief Destructor to free the device memory allocated for the buffer.
|
||||||
*/
|
*/
|
||||||
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
|
~ggml_backend_cann_buffer_context() { delete alloc; }
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -532,7 +548,7 @@ static void* ggml_backend_cann_buffer_get_base(
|
||||||
ggml_backend_buffer_t buffer) {
|
ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_cann_buffer_context* ctx =
|
ggml_backend_cann_buffer_context* ctx =
|
||||||
(ggml_backend_cann_buffer_context*)buffer->context;
|
(ggml_backend_cann_buffer_context*)buffer->context;
|
||||||
return ctx->dev_ptr;
|
return ctx->alloc->get();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -939,7 +955,7 @@ static void ggml_backend_cann_buffer_clear(
|
||||||
(ggml_backend_cann_buffer_context*)buffer->context;
|
(ggml_backend_cann_buffer_context*)buffer->context;
|
||||||
|
|
||||||
ggml_cann_set_device(ctx->device);
|
ggml_cann_set_device(ctx->device);
|
||||||
ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
|
ACL_CHECK(aclrtMemset(ctx->alloc->get(), buffer->size, value, buffer->size));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1001,25 +1017,13 @@ static const char* ggml_backend_cann_buffer_type_name(
|
||||||
static ggml_backend_buffer_t
|
static ggml_backend_buffer_t
|
||||||
ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||||
size_t size) {
|
size_t size) {
|
||||||
ggml_backend_cann_buffer_type_context* buft_ctx =
|
ggml_backend_cann_context* cann_ctx =
|
||||||
(ggml_backend_cann_buffer_type_context*)buft->context;
|
(ggml_backend_cann_context*)buft->device->context;
|
||||||
|
|
||||||
ggml_cann_set_device(buft_ctx->device);
|
ggml_cann_pool_alloc* alloc = new ggml_cann_pool_alloc(cann_ctx->pool(), size);
|
||||||
|
|
||||||
size = std::max(size, (size_t)1);
|
|
||||||
|
|
||||||
void* dev_ptr;
|
|
||||||
aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
|
|
||||||
if (err != ACL_SUCCESS) {
|
|
||||||
GGML_LOG_ERROR(
|
|
||||||
"%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
|
|
||||||
__func__, size / 1024.0 / 1024.0, buft_ctx->device,
|
|
||||||
aclGetRecentErrMsg());
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_cann_buffer_context* ctx =
|
ggml_backend_cann_buffer_context* ctx =
|
||||||
new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
|
new ggml_backend_cann_buffer_context(cann_ctx->device, alloc);
|
||||||
|
|
||||||
return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
|
return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
|
||||||
ctx, size);
|
ctx, size);
|
||||||
|
@ -1130,10 +1134,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
||||||
static bool ggml_backend_cann_buffer_type_initialized = false;
|
static bool ggml_backend_cann_buffer_type_initialized = false;
|
||||||
|
|
||||||
if (!ggml_backend_cann_buffer_type_initialized) {
|
if (!ggml_backend_cann_buffer_type_initialized) {
|
||||||
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
|
for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
|
||||||
ggml_backend_cann_buffer_types[i] = {
|
ggml_backend_cann_buffer_types[i] = {
|
||||||
/* .iface = */ ggml_backend_cann_buffer_type_interface,
|
/* .iface = */ ggml_backend_cann_buffer_type_interface,
|
||||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
|
||||||
/* .context = */
|
/* .context = */
|
||||||
new ggml_backend_cann_buffer_type_context{
|
new ggml_backend_cann_buffer_type_context{
|
||||||
i, "CANN" + std::to_string(i)},
|
i, "CANN" + std::to_string(i)},
|
||||||
|
@ -1199,10 +1203,15 @@ static void * ggml_cann_host_malloc(size_t size) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const size_t alignment = 128;
|
||||||
|
size = GGML_PAD(size, alignment);
|
||||||
|
if (size == 0) {
|
||||||
|
size = alignment;
|
||||||
|
}
|
||||||
|
|
||||||
void * hostPtr = nullptr;
|
void * hostPtr = nullptr;
|
||||||
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
||||||
if (err != ACL_SUCCESS) {
|
if (err != ACL_SUCCESS) {
|
||||||
|
|
||||||
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||||
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@ -1863,17 +1872,17 @@ struct ggml_backend_cann_device_context {
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
|
static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
|
||||||
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context;
|
||||||
return ctx->name.c_str();
|
return ctx->name.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
|
static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
|
||||||
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context;
|
||||||
return ctx->description.c_str();
|
return ctx->description.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||||
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context;
|
||||||
ggml_backend_cann_get_device_memory(ctx->device, free, total);
|
ggml_backend_cann_get_device_memory(ctx->device, free, total);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1900,7 +1909,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||||
|
|
||||||
static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
|
static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||||
GGML_UNUSED(params);
|
GGML_UNUSED(params);
|
||||||
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context;
|
||||||
return ggml_backend_cann_init(ctx->device);
|
return ggml_backend_cann_init(ctx->device);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1920,7 +1929,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
|
||||||
static bool ggml_backend_cann_supports_buft(
|
static bool ggml_backend_cann_supports_buft(
|
||||||
ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||||
if (ggml_backend_buft_is_cann(buft)) {
|
if (ggml_backend_buft_is_cann(buft)) {
|
||||||
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
|
ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context;
|
||||||
ggml_backend_cann_buffer_type_context * buft_ctx =
|
ggml_backend_cann_buffer_type_context * buft_ctx =
|
||||||
(ggml_backend_cann_buffer_type_context *)buft->context;
|
(ggml_backend_cann_buffer_type_context *)buft->context;
|
||||||
return buft_ctx->device == dev_ctx->device;
|
return buft_ctx->device == dev_ctx->device;
|
||||||
|
@ -1929,7 +1938,7 @@ static bool ggml_backend_cann_supports_buft(
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
|
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||||
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
ggml_backend_cann_context * ctx = (ggml_backend_cann_context*)dev->context;
|
||||||
return ggml_backend_cann_buffer_type(ctx->device);
|
return ggml_backend_cann_buffer_type(ctx->device);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1950,7 +1959,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
|
||||||
*/
|
*/
|
||||||
static ggml_backend_event_t ggml_backend_cann_device_event_new(
|
static ggml_backend_event_t ggml_backend_cann_device_event_new(
|
||||||
ggml_backend_dev_t dev) {
|
ggml_backend_dev_t dev) {
|
||||||
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
|
ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context;
|
||||||
|
|
||||||
ggml_cann_set_device(dev_ctx->device);
|
ggml_cann_set_device(dev_ctx->device);
|
||||||
|
|
||||||
|
@ -2058,11 +2067,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
|
||||||
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
|
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
|
||||||
|
|
||||||
for (int i = 0; i < ggml_cann_info().device_count; i++) {
|
for (int i = 0; i < ggml_cann_info().device_count; i++) {
|
||||||
ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
|
ggml_backend_cann_context* dev_ctx = new ggml_backend_cann_context(i);
|
||||||
dev_ctx->description = aclrtGetSocName();
|
|
||||||
dev_ctx->device = i;
|
|
||||||
dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
|
|
||||||
ggml_cann_set_device(i);
|
|
||||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||||
/* .interface = */ ggml_backend_cann_device_interface,
|
/* .interface = */ ggml_backend_cann_device_interface,
|
||||||
/* .reg = */ &reg,
|
/* .reg = */ &reg,
|
||||||
|
@ -2090,17 +2095,12 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
|
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device);
|
||||||
if (ctx == nullptr) {
|
|
||||||
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
ggml_cann_set_device(ctx->device);
|
|
||||||
ggml_backend_t cann_backend =
|
ggml_backend_t cann_backend =
|
||||||
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
|
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
|
||||||
/* .interface = */ ggml_backend_cann_interface,
|
/* .interface = */ ggml_backend_cann_interface,
|
||||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
|
/* .device = */ dev,
|
||||||
/* .context = */ ctx};
|
/* .context = */ dev->context};
|
||||||
|
|
||||||
return cann_backend;
|
return cann_backend;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue