diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 1683da2af..25ffd9126 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2531,47 +2531,6 @@ static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu
  * multiplication will be stored.
  */
 static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
-                                 ggml_tensor* dst) {
-    ggml_tensor* weight = dst->src[0];  // weight
-    ggml_tensor* input = dst->src[1];   // input
-
-    // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto
-    // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
-    BCAST_MUL_MAT_SHAPE(input, weight, dst);
-
-    // transpose weight: [1,2,3,4] -> [1,2,4,3]
-    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
-                              bcast_weight_ne[2], bcast_weight_ne[3],
-                              bcast_weight_ne[4], bcast_weight_ne[5]};
-    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
-                             bcast_weight_nb[2], bcast_weight_nb[3],
-                             bcast_weight_nb[4], bcast_weight_nb[5]};
-
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
-    aclTensor* acl_input_tensor =
-        ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
-    aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
-
-    ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
-}
-
-/**
- * @brief Performs matrix multiplication with floating-point precision on
- *        tensors using the CANN backend.
- *
- * This function performs matrix multiplication of the input tensor and the
- * weight tensor, handling broadcasting and transposing as needed, and stores
- * the result in the destination tensor `dst`.
- *
- * @param ctx The context for the CANN backend operations.
- * @param dst The destination tensor where the result of the matrix
- *            multiplication will be stored.
- */
-static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx,
                                  ggml_tensor* dst) {
     ggml_tensor* weight = dst->src[0];  // weight
     ggml_tensor* input = dst->src[1];   // input
@@ -2637,158 +2596,6 @@ static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx,
  * multiplication will be stored.
  */
 static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
-                                    ggml_tensor* dst,
-                                    const enum ggml_type type) {
-    ggml_tensor* src0 = dst->src[0];  // weight
-    ggml_tensor* src1 = dst->src[1];  // input
-
-    // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
-    // is regarded as batch. weight need transpose.
-    int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
-    float weight_elem_size;
-    if (type == GGML_TYPE_Q4_0) {
-        weight_elem_size = float(sizeof(uint8_t)) / 2;
-    }
-    else if (type == GGML_TYPE_Q8_0) {
-        weight_elem_size = float(sizeof(uint8_t));
-    }
-    else {
-        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
-    }
-    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
-
-    // size of one matrix is element_size * height * width.
-    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
-    size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
-
-    // scale stored at the end of weight. Also need transpose.
-    GGML_ASSERT(QK4_0 == QK8_0);
-    int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
-    size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
-                         scale_elem_size};
-    size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
-    char* scale_offset = (char*)src0->data + weight_size;
-
-    // input
-    void* input_buffer;
-    size_t input_elem_size = sizeof(uint16_t);
-    int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
-    size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
-
-    ggml_cann_pool_alloc input_alloctor(ctx.pool());
-    if (src1->type != GGML_TYPE_F16) {
-        aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
-        input_buffer = input_alloctor.get();
-
-        int64_t* input_cast_ne = src1->ne;
-        size_t input_cast_nb[GGML_MAX_DIMS];
-        input_cast_nb[0] = sizeof(uint16_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
-        }
-
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
-            input_cast_nb, GGML_MAX_DIMS);
-        aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
-    } else {
-        input_buffer = src1->data;
-    }
-
-    // output
-    size_t output_elem_size = sizeof(uint16_t);
-    int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
-    size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
-    ggml_cann_pool_alloc output_alloctor(
-        ctx.pool(), ggml_nelements(dst) * output_elem_size);
-    void* output_buffer = output_alloctor.get();
-    size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
-
-    // aclnn
-    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
-    void* workspaceAddr = nullptr;
-
-    for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
-        for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
-            int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
-            int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
-
-            int64_t batch1 = n1 * src1->ne[2] + c1;
-            int64_t batch0 = n0 * src0->ne[2] + c0;
-
-            aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-                (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
-                input_elem_size, input_ne, input_nb, 2);
-            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
-                (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
-                weight_nb, 2);
-            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2);
-            aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2);
-
-            ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
-                &workspaceSize, &executor));
-
-            if (workspaceSize > 0 && workspaceAddr == nullptr) {
-                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
-                                                         workspaceSize);
-                workspaceAddr = workspace_allocator.get();
-            }
-
-            ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
-                workspaceAddr, workspaceSize, executor, ctx.stream()));
-
-            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-            ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
-            ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
-            ACL_CHECK(aclDestroyTensor(acl_output_tensor));
-        }
-    }
-
-    // cast out
-    int64_t* output_cast_ne = dst->ne;
-    size_t output_cast_nb[GGML_MAX_DIMS];
-    output_cast_nb[0] = sizeof(uint16_t);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
-    }
-
-    aclTensor* acl_output_tensor =
-        ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
-                                output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
-    aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-    aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
-
-    ACL_CHECK(aclDestroyTensor(acl_output_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
-}
-
-/**
- * @brief Performs matrix multiplication with quantized weights and
- *        floating-point inputs using the CANN backend.
- *
- * This function performs matrix multiplication of the input tensor `src1` and
- * the weight tensor `src0`, handling broadcasting, transposing, and
- * quantization as needed, and stores the result in the destination tensor
- * `dst`.
- *
- * @param ctx The context for the CANN backend operations.
- * @param dst The destination tensor where the result of the matrix
- *            multiplication will be stored.
- */
-static void ggml_cann_mul_mat_quant2(ggml_backend_cann_context& ctx,
                                     ggml_tensor* dst,
                                     const enum ggml_type type) {
     ggml_tensor* src0 = dst->src[0];  // weight
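For reference, the Q4_0/Q8_0 weight layout that both the removed and the surviving quant matmul assume is: packed quantized elements first (half a byte per element for Q4_0, one byte for Q8_0), followed by one fp16 scale per 32-element block. Below is a minimal sketch of the offset arithmetic only; the function names (q8_0_scale_offset, q8_0_scale_bytes) are hypothetical, and the constant mirrors ggml's QK4_0 == QK8_0 == 32 asserted above.

// Sketch only: the byte layout behind the scale_offset/scale_stride math
// above, for a Q8_0 weight tensor with dims ne[0..3].
#include <cstddef>
#include <cstdint>

constexpr int64_t QK = 32;  // QK4_0 == QK8_0, asserted in the code above

// Packed quantized data occupies one byte per element for Q8_0
// (ne[0]*ne[1]*ne[2]*ne[3] bytes); the fp16 scales start right after it.
size_t q8_0_scale_offset(const int64_t ne[4]) {
    return (size_t)(ne[0] * ne[1] * ne[2] * ne[3]);
}

// One fp16 (uint16_t) scale per 32-element block.
size_t q8_0_scale_bytes(const int64_t ne[4]) {
    return (size_t)(ne[0] * ne[1] * ne[2] * ne[3] / QK) * sizeof(uint16_t);
}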
@@ -2979,11 +2786,11 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     switch (type) {
         case GGML_TYPE_F32:
         case GGML_TYPE_F16:
-            ggml_cann_mat_mul_fp2(ctx, dst);
+            ggml_cann_mat_mul_fp(ctx, dst);
             break;
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
-            ggml_cann_mul_mat_quant2(ctx, dst, type);
+            ggml_cann_mul_mat_quant(ctx, dst, type);
             break;
         default:
             GGML_ABORT("fatal error");
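The manual batch broadcast in the quant path above (n0 = n1 / (src1->ne[3] / src0->ne[3]), and likewise for ne[2]) follows ggml's mul_mat convention: the input's batch dims are integer multiples of the weight's, so several input batches map onto one weight batch. A standalone sketch with hypothetical names (batch_pair, map_batches):

// Sketch of the batch-index mapping used in the quant loop above. Assumes
// src1_ne[2]/src1_ne[3] are integer multiples of src0_ne[2]/src0_ne[3].
#include <cstdint>

struct batch_pair { int64_t batch0, batch1; };  // weight batch, input batch

batch_pair map_batches(const int64_t src0_ne[4], const int64_t src1_ne[4],
                       int64_t n1, int64_t c1) {
    const int64_t n0 = n1 / (src1_ne[3] / src0_ne[3]);  // broadcast on ne[3]
    const int64_t c0 = c1 / (src1_ne[2] / src0_ne[2]);  // broadcast on ne[2]
    return { n0 * src0_ne[2] + c0,    // flattened weight batch index
             n1 * src1_ne[2] + c1 };  // flattened input batch index
}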
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index a9a1cfb63..531e87c7a 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -341,7 +341,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
     std::vector<void*> map_offsets;
 
     /**
-     * @brief Constructor to initialize the buffer pool with virtual memory for
      * @brief Constructor to initialize the buffer pool with virtual memory for
      *        a specific device.
      *
@@ -1872,17 +1871,17 @@ struct ggml_backend_cann_device_context {
 };
 
 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
-    ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context;
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
     return ctx->name.c_str();
 }
 
 static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
-    ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context;
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
     return ctx->description.c_str();
 }
 
 static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context;
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
     ggml_backend_cann_get_device_memory(ctx->device, free, total);
 }
 
@@ -1909,7 +1908,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
 
 static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(params);
-    ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context;
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
     return ggml_backend_cann_init(ctx->device);
 }
 
@@ -1929,7 +1928,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
 static bool ggml_backend_cann_supports_buft(
     ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
     if (ggml_backend_buft_is_cann(buft)) {
-        ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context;
+        ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
         ggml_backend_cann_buffer_type_context * buft_ctx =
             (ggml_backend_cann_buffer_type_context *)buft->context;
         return buft_ctx->device == dev_ctx->device;
@@ -1938,7 +1937,7 @@ static bool ggml_backend_cann_supports_buft(
 }
 
 static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
-    ggml_backend_cann_context * ctx = (ggml_backend_cann_context*)dev->context;
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
     return ggml_backend_cann_buffer_type(ctx->device);
 }
 
@@ -1959,7 +1958,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
  */
 static ggml_backend_event_t ggml_backend_cann_device_event_new(
     ggml_backend_dev_t dev) {
-    ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context;
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
 
     ggml_cann_set_device(dev_ctx->device);
 
@@ -2067,7 +2066,11 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
         ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
 
         for (int i = 0; i < ggml_cann_info().device_count; i++) {
-            ggml_backend_cann_context* dev_ctx = new ggml_backend_cann_context(i);
+            ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
+            dev_ctx->description = aclrtGetSocName();
+            dev_ctx->device = i;
+            dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+            ggml_cann_set_device(i);
             ggml_backend_dev_t dev = new ggml_backend_device {
                 /* .interface = */ ggml_backend_cann_device_interface,
                 /* .reg       = */ &reg,
@@ -2095,12 +2098,17 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
         return nullptr;
     }
 
-    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device);
+    ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
+    if (ctx == nullptr) {
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return nullptr;
+    }
+    ggml_cann_set_device(ctx->device);
 
     ggml_backend_t cann_backend =
         new ggml_backend{/* .guid      = */ ggml_backend_cann_guid(),
                          /* .interface = */ ggml_backend_cann_interface,
-                         /* .device    = */ dev,
-                         /* .context   = */ dev->context};
+                         /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+                         /* .context   = */ ctx};
 
     return cann_backend;
 }
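Net effect of the ggml-cann.cpp changes: registry device entries now carry a lightweight ggml_backend_cann_device_context (device id, name, SoC description), while the heavyweight runtime ggml_backend_cann_context is created per backend instance inside ggml_backend_cann_init. The sketch below is illustrative only: the device-context field set is inferred from this diff (its real definition lives in ggml-cann.cpp), and main() is a hypothetical caller using the public ggml-backend / ggml-cann headers.

#include "ggml-backend.h"
#include "ggml-cann.h"
#include <cstdio>
#include <string>

// Mirrors ggml_backend_cann_device_context as used by the casts above.
struct cann_device_context_sketch {
    int32_t     device;       // ACL device id, set in ggml_backend_cann_reg()
    std::string name;         // GGML_CANN_NAME + std::to_string(device)
    std::string description;  // aclrtGetSocName()
};

int main() {
    // Each init call now builds its own runtime context instead of sharing
    // the registry entry's context with the backend instance.
    ggml_backend_t backend = ggml_backend_cann_init(/*device=*/0);
    if (backend == nullptr) {
        std::fprintf(stderr, "failed to init CANN backend 0\n");
        return 1;
    }
    std::printf("backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}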