diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index d4d9e2cd5..3584c4112 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -1,5 +1,6 @@ #include "ggml-qnn.h" +#include #include #include #include @@ -81,7 +82,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", - .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, @@ -91,7 +91,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", - .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, @@ -101,7 +100,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", - .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, @@ -112,23 +110,16 @@ struct ggml_backend_qnn_buffer_context { ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {} ~ggml_backend_qnn_buffer_context() { + tensors.clear(); if (buffer) { free(buffer); } - - for (auto *qnn_tensor : qnn_tensors) { - qnn::device_tensor_free(*qnn_tensor); - free(qnn_tensor); - } - - qnn_tensors.clear(); } + void *buffer = nullptr; - struct ggml_backend_qnn_context *backend_ctx = nullptr; - + std::list> tensors; size_t buffer_size = 0; - std::vector qnn_tensors; size_t device; std::string name; }; @@ -235,37 +226,14 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (!p_qnn_tensor) { - QNN_LOG_WARN("calloc failed"); - return; - } - - static int idx = 0; - char tensor_name[GGML_MAX_NAME] = { 0 }; - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - Qnn_DataType_t qnn_data_type = qnn::device_datatype_from_ggml_datatype(tensor->type); - Qnn_TensorType_t qnn_tensor_type = qnn::device_tensortype_from_ggml_tensor(tensor); - Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW; - if (ctx->device == QNN_BACKEND_GPU) { - qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE; - } - - uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - Qnn_Tensor_t qnn_tensor; - qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type, - qnn_data_type, dimensions); - - Qnn_ErrorHandle_t error = qnn::device_tensor_deep_copy(qnn_tensor, *p_qnn_tensor); - if (error != QNN_SUCCESS) { - free(p_qnn_tensor); - QNN_LOG_WARN("init tensor failed"); + auto instance = ctx->backend_ctx->instance; + auto qnn_tensor = std::make_unique(tensor, (QNNBackend)(ctx->device), instance); + if (!qnn_tensor->is_valid()) { + QNN_LOG_WARN("Create ggml_qnn_tensor failed"); return; } - tensor->extra = p_qnn_tensor; - ctx->qnn_tensors.push_back(p_qnn_tensor); + ctx->tensors.push_back(std::move(qnn_tensor)); } GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, @@ -373,17 +341,16 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - auto *instance = g_qnn_mgr[ctx->device].instance; - if (instance != nullptr) { - for (const auto &graph_item : ctx->qnn_graph_map) { + auto instance = g_qnn_mgr[ctx->device].instance; + if (instance) { + for (const auto &graph_item : ctx->qnn_binary_graph_cache) { QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); } - ctx->qnn_graph_map.clear(); + ctx->qnn_binary_graph_cache.clear(); instance->qnn_finalize(); - delete instance; - g_qnn_mgr[ctx->device].instance = nullptr; + g_qnn_mgr[ctx->device].instance.reset(); } if (g_qnn_mgr[ctx->device].backend != nullptr) { @@ -582,17 +549,15 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) { } } - auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + auto instance = std::make_shared(qnn_lib_path, g_qnn_mgr[device].lib, ""); result = instance->qnn_init(nullptr); - if (0 != result) { + if (result != 0) { QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); - delete instance; return nullptr; } auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface.is_loaded()) { QNN_LOG_WARN("qnn subsystem failure\n"); - delete instance; return nullptr; } diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 79e280fcb..1914e64dc 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -23,10 +23,10 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return false; } - auto *instance = ctx->instance; - auto *tensor0 = src0->extra; - auto *tensor1 = src1->extra; - auto *tensor2 = dst->extra; + auto instance = ctx->instance; + auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0); + auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1); + auto *tensor2 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst); if (!instance || !tensor0 || !tensor1 || !tensor2) { QNN_LOG_WARN("invalid tensors\n"); return false; @@ -35,6 +35,80 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, return true; } +template +bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name, + const std::array &inputs, + const std::array &outputs) { + std::array qnn_input_tensors; + for (size_t i = 0; i < inputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]); + if (!tensor || !tensor->bind_to_graph(*graph)) { + return false; + } + + qnn_input_tensors[i] = tensor->get_qnn_tensor(); + } + + std::array qnn_output_tensors; + for (size_t i = 0; i < outputs.size(); ++i) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]); + if (!tensor || !tensor->bind_to_graph(*graph)) { + return false; + } + + qnn_output_tensors[i] = tensor->get_qnn_tensor(); + } + + if (!graph->add_nodes(op_name, qnn_input_tensors, qnn_output_tensors)) { + return false; + } + + return true; +} + +template +bool write_to_qnn_tensors(const std::array &inputs) { + for (auto &input : inputs) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(input); + if (!tensor || !tensor->write_to_qnn_tensor()) { + return false; + } + } + + return true; +} + +template +bool read_from_qnn_tensors(const std::array &outputs) { + for (auto &output : outputs) { + auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output); + if (!tensor || !tensor->read_from_qnn_tensor()) { + return false; + } + } + + return true; +} + +template +bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, + const std::array &inputs, + const std::array &outputs) { + if (!write_to_qnn_tensors<_InputSize>(inputs)) { + return false; + } + + if (!graph->execute()) { + return false; + } + + if (!read_from_qnn_tensors<_OutputSize>(outputs)) { + return false; + } + + return true; +} + } // namespace #ifndef NDEBUG @@ -61,13 +135,10 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, bool succeed = false; std::string graph_key(ggml_op_name(GGML_OP_ADD)); - auto it = ctx->qnn_graph_map.find(graph_key); - if (it != ctx->qnn_graph_map.end()) { - const auto &graph_item = it->second; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - std::get<0>(graph_item)->execute(); + auto it = ctx->qnn_binary_graph_cache.find(graph_key); + qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; + if (it != ctx->qnn_binary_graph_cache.end()) { + graph_ptr = it->second.get(); } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), @@ -78,34 +149,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0, goto failure; } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); - if (!tensor_input1.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); - if (!tensor_output.is_valid()) { + if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst })) { goto failure; } - if (!graph->add_nodes(QNN_OP_ELEMENT_WISE_ADD, - { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, - { *tensor_output.get_qnn_tensor() })) { - goto failure; - } - - if (!graph->execute()) { - goto failure; - } - - ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); + graph_ptr = graph.get(); + ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); } - succeed = true; + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); failure: if (!succeed) { @@ -143,13 +195,10 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s bool succeed = false; std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT)); - auto it = ctx->qnn_graph_map.find(graph_key); - if (it != ctx->qnn_graph_map.end()) { - const auto &graph_item = it->second; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - std::get<0>(graph_item)->execute(); + auto it = ctx->qnn_binary_graph_cache.find(graph_key); + qnn::ggml_qnn_graph_binary *graph_ptr = nullptr; + if (it != ctx->qnn_binary_graph_cache.end()) { + graph_ptr = it->second.get(); } else { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; auto graph = std::make_unique(graph_name, (QNNBackend)(ctx->device), @@ -160,33 +209,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s goto failure; } - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx); - if (!tensor_input1.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx); - if (!tensor_output.is_valid()) { + if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_MAT_MUL, { src0, src1 }, { dst })) { goto failure; } - if (!graph->add_nodes(QNN_OP_MAT_MUL, { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, - { *tensor_output.get_qnn_tensor() })) { - goto failure; - } - - if (!graph->execute()) { - goto failure; - } - - ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor()); + graph_ptr = graph.get(); + ctx->qnn_binary_graph_cache[graph_key] = std::move(graph); } - succeed = true; + succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); failure: if (!succeed) { diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index d60b334c0..48b243577 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -16,12 +16,10 @@ struct ggml_backend_qnn_context { int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; - qnn::qnn_instance *instance; + std::shared_ptr instance; ggml_backend *backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; qnn::qcom_socinfo socinfo; - std::unordered_map, Qnn_Tensor_t *, - Qnn_Tensor_t *, Qnn_Tensor_t *>> - qnn_graph_map; + std::unordered_map> qnn_binary_graph_cache; }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 651fc1c53..6f9628cbd 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -73,6 +73,22 @@ public: _graph_handle = graph_handle; } + bool create_graph_tensor(Qnn_Tensor_t &tensor) { + if (!is_valid()) { + QNN_LOG_ERROR("Invalid graph\n"); + return false; + } + + auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor); + if (err != QNN_SUCCESS) { + QNN_LOG_INFO("error = %d\n", err); + QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); + return false; + } + + return true; + } + bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs, const output_tensor_array_t &tensor_outputs) { if (!is_valid()) { @@ -124,6 +140,8 @@ public: Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } + const std::string &get_name() const { return _graph_name; } + private: const std::string _graph_name; const QNNBackend _device; diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 7c2456510..58ca8648b 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -49,7 +49,5 @@ using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); } // namespace qnn -#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN - #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 diff --git a/ggml/src/ggml-qnn/qnn.hpp b/ggml/src/ggml-qnn/qnn.hpp index 26465c96a..400ce005b 100644 --- a/ggml/src/ggml-qnn/qnn.hpp +++ b/ggml/src/ggml-qnn/qnn.hpp @@ -637,20 +637,20 @@ public: return 3; } - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + if (is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(*p_tensor))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); return 4; } int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { + if (mem_fd == -1) { QNN_LOG_WARN("failed to get file descriptor\n"); return 5; } QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { { QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, + Qnn_MemDescriptor_t descriptor = { { QNN_TENSOR_GET_RANK(*p_tensor), QNN_TENSOR_GET_DIMENSIONS(*p_tensor), nullptr }, - QNN_VER_PTR(*p_tensor)->dataType, + QNN_TENSOR_GET_DATA_TYPE(*p_tensor), QNN_MEM_TYPE_ION, { { mem_fd } } }; Qnn_MemHandle_t handle = nullptr; @@ -662,9 +662,10 @@ public: strerror(error)); return 6; } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + QNN_LOG_INFO("tensor %s successfully register shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor)); } - QNN_VER_PTR(*p_tensor)->memHandle = handle; + + QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle); _qnn_mem_set.insert((std::pair(p_data, handle))); return 0; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 8a9196616..335aafe53 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,127 +1,197 @@ #pragma once +#include +#include +#include + #include "ggml-qnn.h" #include "QnnTensor.h" #include "System/QnnSystemInterface.h" #include "backend.hpp" +#include "graph.hpp" #include "qnn.hpp" +#include "utils.hpp" namespace qnn { -template -class ggml_qnn_tensor_readwrite { +class ggml_qnn_tensor { public: - explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle, - ggml_backend_qnn_context *ctx) : - _tensor(tensor), _qnn_tensor(reinterpret_cast(tensor->extra)), _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; - QNN_VER_PTR(*_qnn_tensor)->type = _tensorType; - if (is_npu) { - QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 }; + static ggml_qnn_tensor *from_ggml_tensor(const ggml_tensor *tensor) { + if (!tensor) { + return nullptr; } - auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); - if (err != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", err); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; - } - - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; - - if (is_npu) { - auto *instance = ctx->instance; - uint8_t *qnn_buffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void *))); - if (!qnn_buffer) { - QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - // No free for _qnn_tensor, because it's not registered. - return; - } else { - QNN_LOG_INFO("alloc rpcmem successfully\n"); - } - - instance->register_rpcmem(qnn_buffer, _qnn_tensor); - if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); - } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; - } + return static_cast(tensor->extra); } - explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, - ggml_backend_qnn_context *ctx) : - _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) { - _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions; - const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type); - const bool is_npu = ctx->device == QNN_BACKEND_NPU; + explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr qnn_instance) : + _tensor(tensor), _device(device), _qnn_instance(qnn_instance) { + _tensor_name = ggml_get_name(tensor); + if (_tensor_name.empty()) { + static std::atomic_uint32_t unnamed_tensor_count = 0; + char buffer[GGML_MAX_NAME] = {}; + snprintf(buffer, sizeof(buffer), "unnamed_%p", unnamed_tensor_count++); + _tensor_name = buffer; + } + QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); _dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[1] = (uint32_t)tensor->ne[1]; _dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; - QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor); - QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions); + QNN_TENSOR_SET_TYPE(_qnn_tensor, device_tensortype_from_ggml_tensor(tensor)); + QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + // TODO: set the quantizeParams base on the tensor type + QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor)); + const bool is_npu = device == QNN_BACKEND_NPU; if (is_npu) { - uint8_t *qnn_buffer = - static_cast(ctx->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr); + } else { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + } + + tensor->extra = this; + } + + template + bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph) { + if (!is_valid()) { + QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + return false; + } + + if (_graph_handle) { + if (_graph_handle != graph.get_graph_handler()) { + QNN_LOG_WARN("tensor %s has been bound to another graph", _tensor_name.c_str()); + return false; + } else { + QNN_LOG_INFO("tensor %s already bound to same graph %s", _tensor_name.c_str(), + graph.get_name().c_str()); + return true; + } + } + + Qnn_Tensor_t tensor = _qnn_tensor; + if (!graph.create_graph_tensor(tensor)) { + QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str()); + return false; + } + + if (!alloc_rpc_mem()) { + QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + return false; + } + + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor)); + _graph_handle = graph.get_graph_handler(); + return true; + } + + bool write_to_qnn_tensor() { + if (!is_valid()) { + QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + return false; + } + + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_WARN("tensor %s not writable", _tensor_name.c_str()); + return false; + } + + if (should_use_mem_handle()) { + uint8_t *qnn_buffer = static_cast( + _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); if (qnn_buffer) { - memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); + memcpy(qnn_buffer, _tensor->data, ggml_nbytes(_tensor)); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); - QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); - _context = nullptr; - return; + return false; } - } else { - QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) }; } + + // For CPU and GPU, the data is already in the tensor. + return true; } - ~ggml_qnn_tensor_readwrite() { - if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && _context && - _context->device == QNN_BACKEND_NPU) { + bool read_from_qnn_tensor() { + if (!is_valid()) { + QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str()); + return false; + } + + auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); + if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { + QNN_LOG_WARN("tensor %s not readable", _tensor_name.c_str()); + return false; + } + + if (should_use_mem_handle()) { uint8_t *qnn_buffer = static_cast( - _context->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle)); - memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor))); + if (qnn_buffer) { + memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor)); + } else { + QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + return false; + } } - QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions; + // For CPU and GPU, the data is already in the tensor. + return true; } - bool is_valid() const { return _context; } - Qnn_Tensor_t *get_qnn_tensor() const { return _qnn_tensor; } + bool is_valid() const { return _tensor; } + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } private: - const ggml_tensor *_tensor; - Qnn_Tensor_t *_qnn_tensor; - ggml_backend_qnn_context *_context; - uint32_t *_old_dimensions; - uint32_t _dimensions[4] = {}; + bool alloc_rpc_mem() { + if (!should_use_mem_handle()) { + return true; + } - ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite &) = delete; - void operator=(const ggml_qnn_tensor_readwrite &) = delete; - ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite &&) = delete; - void operator=(ggml_qnn_tensor_readwrite &&) = delete; + uint8_t *qnn_buffer = + static_cast(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *))); + if (!qnn_buffer) { + QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); + QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); + return false; + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + + auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error); + QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str()); + return false; + } + + return true; + } + + bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + + const ggml_tensor *_tensor; + QNNBackend _device; + std::shared_ptr _qnn_instance; + Qnn_Tensor_t _qnn_tensor = QNN_TENSOR_INIT; + uint32_t _dimensions[4] = {}; + std::string _tensor_name; + Qnn_GraphHandle_t _graph_handle = nullptr; + + ggml_qnn_tensor(const ggml_qnn_tensor &) = delete; + void operator=(const ggml_qnn_tensor &) = delete; + ggml_qnn_tensor(ggml_qnn_tensor &&) = delete; + void operator=(ggml_qnn_tensor &&) = delete; }; -using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite; -using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite; - } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 87d908f1e..84cd8354e 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -102,6 +102,13 @@ inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) { return QNN_TENSORMEMTYPE_UNDEFINED; } +inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; @@ -224,6 +231,7 @@ public: #define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) #define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) #define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor) #define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) #define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value)