refactoring ggml_qnn_tensor

This commit is contained in:
Hongrui Chen 2024-07-07 23:51:12 +08:00 committed by hongruichen
parent 874216b9c8
commit 5f2e3918f6
8 changed files with 301 additions and 212 deletions

View file

@ -1,5 +1,6 @@
#include "ggml-qnn.h" #include "ggml-qnn.h"
#include <list>
#include <stdatomic.h> #include <stdatomic.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@ -81,7 +82,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
.threads = 1, .threads = 1,
.name = "qnn-cpu", .name = "qnn-cpu",
.lib = "libQnnCpu.so", .lib = "libQnnCpu.so",
.instance = nullptr,
.backend = nullptr, .backend = nullptr,
.raw_interface = {}, .raw_interface = {},
.raw_system_interface = {}, .raw_system_interface = {},
@ -91,7 +91,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
.threads = 1, .threads = 1,
.name = "qnn-gpu", .name = "qnn-gpu",
.lib = "libQnnGpu.so", .lib = "libQnnGpu.so",
.instance = nullptr,
.backend = nullptr, .backend = nullptr,
.raw_interface = {}, .raw_interface = {},
.raw_system_interface = {}, .raw_system_interface = {},
@ -101,7 +100,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
.threads = 1, .threads = 1,
.name = "qnn-npu", .name = "qnn-npu",
.lib = "libQnnHtp.so", .lib = "libQnnHtp.so",
.instance = nullptr,
.backend = nullptr, .backend = nullptr,
.raw_interface = {}, .raw_interface = {},
.raw_system_interface = {}, .raw_system_interface = {},
@ -112,23 +110,16 @@ struct ggml_backend_qnn_buffer_context {
ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {} ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {}
~ggml_backend_qnn_buffer_context() { ~ggml_backend_qnn_buffer_context() {
tensors.clear();
if (buffer) { if (buffer) {
free(buffer); free(buffer);
} }
for (auto *qnn_tensor : qnn_tensors) {
qnn::device_tensor_free(*qnn_tensor);
free(qnn_tensor);
} }
qnn_tensors.clear();
}
void *buffer = nullptr; void *buffer = nullptr;
struct ggml_backend_qnn_context *backend_ctx = nullptr; struct ggml_backend_qnn_context *backend_ctx = nullptr;
std::list<std::unique_ptr<qnn::ggml_qnn_tensor>> tensors;
size_t buffer_size = 0; size_t buffer_size = 0;
std::vector<Qnn_Tensor_t *> qnn_tensors;
size_t device; size_t device;
std::string name; std::string name;
}; };
@ -235,37 +226,14 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu
GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) {
ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context;
Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); auto instance = ctx->backend_ctx->instance;
if (!p_qnn_tensor) { auto qnn_tensor = std::make_unique<qnn::ggml_qnn_tensor>(tensor, (QNNBackend)(ctx->device), instance);
QNN_LOG_WARN("calloc failed"); if (!qnn_tensor->is_valid()) {
QNN_LOG_WARN("Create ggml_qnn_tensor failed");
return; return;
} }
static int idx = 0; ctx->tensors.push_back(std::move(qnn_tensor));
char tensor_name[GGML_MAX_NAME] = { 0 };
snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++);
Qnn_DataType_t qnn_data_type = qnn::device_datatype_from_ggml_datatype(tensor->type);
Qnn_TensorType_t qnn_tensor_type = qnn::device_tensortype_from_ggml_tensor(tensor);
Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW;
if (ctx->device == QNN_BACKEND_GPU) {
qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
}
uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2],
(uint32_t)tensor->ne[3] };
Qnn_Tensor_t qnn_tensor;
qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type,
qnn_data_type, dimensions);
Qnn_ErrorHandle_t error = qnn::device_tensor_deep_copy(qnn_tensor, *p_qnn_tensor);
if (error != QNN_SUCCESS) {
free(p_qnn_tensor);
QNN_LOG_WARN("init tensor failed");
return;
}
tensor->extra = p_qnn_tensor;
ctx->qnn_tensors.push_back(p_qnn_tensor);
} }
GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor,
@ -373,17 +341,16 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) {
ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context; ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context;
QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name);
auto *instance = g_qnn_mgr[ctx->device].instance; auto instance = g_qnn_mgr[ctx->device].instance;
if (instance != nullptr) { if (instance) {
for (const auto &graph_item : ctx->qnn_graph_map) { for (const auto &graph_item : ctx->qnn_binary_graph_cache) {
QNN_LOG_INFO("graph type:%s", graph_item.first.c_str()); QNN_LOG_INFO("graph type:%s", graph_item.first.c_str());
} }
ctx->qnn_graph_map.clear(); ctx->qnn_binary_graph_cache.clear();
instance->qnn_finalize(); instance->qnn_finalize();
delete instance; g_qnn_mgr[ctx->device].instance.reset();
g_qnn_mgr[ctx->device].instance = nullptr;
} }
if (g_qnn_mgr[ctx->device].backend != nullptr) { if (g_qnn_mgr[ctx->device].backend != nullptr) {
@ -582,17 +549,15 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) {
} }
} }
auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); auto instance = std::make_shared<qnn::qnn_instance>(qnn_lib_path, g_qnn_mgr[device].lib, "");
result = instance->qnn_init(nullptr); result = instance->qnn_init(nullptr);
if (0 != result) { if (result != 0) {
QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device));
delete instance;
return nullptr; return nullptr;
} }
auto qnn_interface = instance->get_qnn_interface(); auto qnn_interface = instance->get_qnn_interface();
if (!qnn_interface.is_loaded()) { if (!qnn_interface.is_loaded()) {
QNN_LOG_WARN("qnn subsystem failure\n"); QNN_LOG_WARN("qnn subsystem failure\n");
delete instance;
return nullptr; return nullptr;
} }

View file

@ -23,10 +23,10 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
return false; return false;
} }
auto *instance = ctx->instance; auto instance = ctx->instance;
auto *tensor0 = src0->extra; auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0);
auto *tensor1 = src1->extra; auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1);
auto *tensor2 = dst->extra; auto *tensor2 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst);
if (!instance || !tensor0 || !tensor1 || !tensor2) { if (!instance || !tensor0 || !tensor1 || !tensor2) {
QNN_LOG_WARN("invalid tensors\n"); QNN_LOG_WARN("invalid tensors\n");
return false; return false;
@ -35,6 +35,80 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
return true; return true;
} }
// Binds every input and output ggml tensor to `graph` and then adds one `op_name` node
// that consumes the bound inputs and produces the bound outputs.
//
// Inputs are bound first, then outputs, each in array order. Returns false as soon as a
// tensor has no ggml_qnn_tensor attached (from_ggml_tensor() yields null), a bind fails,
// or add_nodes() fails; returns true otherwise.
template <size_t _InputSize, size_t _OutputSize>
bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name,
                               const std::array<const ggml_tensor *, _InputSize> &inputs,
                               const std::array<ggml_tensor *, _OutputSize> &outputs) {
    std::array<Qnn_Tensor_t, _InputSize> bound_inputs;
    size_t in_slot = 0;
    for (const ggml_tensor *src : inputs) {
        auto *wrapper = qnn::ggml_qnn_tensor::from_ggml_tensor(src);
        if (!wrapper) {
            return false;
        }
        if (!wrapper->bind_to_graph(*graph)) {
            return false;
        }
        bound_inputs[in_slot++] = wrapper->get_qnn_tensor();
    }

    std::array<Qnn_Tensor_t, _OutputSize> bound_outputs;
    size_t out_slot = 0;
    for (const ggml_tensor *dst : outputs) {
        auto *wrapper = qnn::ggml_qnn_tensor::from_ggml_tensor(dst);
        if (!wrapper) {
            return false;
        }
        if (!wrapper->bind_to_graph(*graph)) {
            return false;
        }
        bound_outputs[out_slot++] = wrapper->get_qnn_tensor();
    }

    // The node is added only once every tensor has been bound successfully.
    return graph->add_nodes(op_name, bound_inputs, bound_outputs);
}
// Calls write_to_qnn_tensor() on the ggml_qnn_tensor attached to each input, in array order.
// Returns false at the first input that has no attached ggml_qnn_tensor or whose write
// fails; returns true when every input was written.
template <size_t _InputSize>
bool write_to_qnn_tensors(const std::array<const ggml_tensor *, _InputSize> &inputs) {
    for (size_t idx = 0; idx < inputs.size(); ++idx) {
        auto *wrapper = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[idx]);
        if (!wrapper) {
            return false;
        }
        if (!wrapper->write_to_qnn_tensor()) {
            return false;
        }
    }

    return true;
}
// Calls read_from_qnn_tensor() on the ggml_qnn_tensor attached to each output, in array order.
// Returns false at the first output that has no attached ggml_qnn_tensor or whose read
// fails; returns true when every output was read.
template <size_t _OutputSize>
bool read_from_qnn_tensors(const std::array<ggml_tensor *, _OutputSize> &outputs) {
    for (size_t idx = 0; idx < outputs.size(); ++idx) {
        auto *wrapper = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[idx]);
        if (!wrapper) {
            return false;
        }
        if (!wrapper->read_from_qnn_tensor()) {
            return false;
        }
    }

    return true;
}
// Runs one graph execution in three steps: write the inputs to their QNN tensors,
// execute the graph, then read the outputs back.
// Each step runs only if the previous one succeeded (short-circuit evaluation);
// the result is true only when all three succeed.
template <size_t _InputSize, size_t _OutputSize>
bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph,
                   const std::array<const ggml_tensor *, _InputSize> &inputs,
                   const std::array<ggml_tensor *, _OutputSize> &outputs) {
    return write_to_qnn_tensors<_InputSize>(inputs) && graph->execute() &&
           read_from_qnn_tensors<_OutputSize>(outputs);
}
} // namespace } // namespace
#ifndef NDEBUG #ifndef NDEBUG
@ -61,13 +135,10 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
bool succeed = false; bool succeed = false;
std::string graph_key(ggml_op_name(GGML_OP_ADD)); std::string graph_key(ggml_op_name(GGML_OP_ADD));
auto it = ctx->qnn_graph_map.find(graph_key); auto it = ctx->qnn_binary_graph_cache.find(graph_key);
if (it != ctx->qnn_graph_map.end()) { qnn::ggml_qnn_graph_binary *graph_ptr = nullptr;
const auto &graph_item = it->second; if (it != ctx->qnn_binary_graph_cache.end()) {
qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); graph_ptr = it->second.get();
qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
std::get<0>(graph_item)->execute();
} else { } else {
graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name;
auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device), auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device),
@ -78,34 +149,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
goto failure; goto failure;
} }
qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst })) {
if (!tensor_input0.is_valid()) {
goto failure;
}
qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx);
if (!tensor_input1.is_valid()) {
goto failure;
}
qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx);
if (!tensor_output.is_valid()) {
goto failure; goto failure;
} }
if (!graph->add_nodes(QNN_OP_ELEMENT_WISE_ADD, graph_ptr = graph.get();
{ *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, ctx->qnn_binary_graph_cache[graph_key] = std::move(graph);
{ *tensor_output.get_qnn_tensor() })) {
goto failure;
} }
if (!graph->execute()) { succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
goto failure;
}
ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(),
tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor());
}
succeed = true;
failure: failure:
if (!succeed) { if (!succeed) {
@ -143,13 +195,10 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s
bool succeed = false; bool succeed = false;
std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT)); std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT));
auto it = ctx->qnn_graph_map.find(graph_key); auto it = ctx->qnn_binary_graph_cache.find(graph_key);
if (it != ctx->qnn_graph_map.end()) { qnn::ggml_qnn_graph_binary *graph_ptr = nullptr;
const auto &graph_item = it->second; if (it != ctx->qnn_binary_graph_cache.end()) {
qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); graph_ptr = it->second.get();
qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
std::get<0>(graph_item)->execute();
} else { } else {
graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name;
auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device), auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device),
@ -160,33 +209,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s
goto failure; goto failure;
} }
qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx); if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_MAT_MUL, { src0, src1 }, { dst })) {
if (!tensor_input0.is_valid()) {
goto failure;
}
qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx);
if (!tensor_input1.is_valid()) {
goto failure;
}
qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx);
if (!tensor_output.is_valid()) {
goto failure; goto failure;
} }
if (!graph->add_nodes(QNN_OP_MAT_MUL, { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }, graph_ptr = graph.get();
{ *tensor_output.get_qnn_tensor() })) { ctx->qnn_binary_graph_cache[graph_key] = std::move(graph);
goto failure;
} }
if (!graph->execute()) { succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
goto failure;
}
ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(),
tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor());
}
succeed = true;
failure: failure:
if (!succeed) { if (!succeed) {

View file

@ -16,12 +16,10 @@ struct ggml_backend_qnn_context {
int threads; int threads;
char name[GGML_MAX_NAME]; char name[GGML_MAX_NAME];
char lib[GGML_MAX_NAME]; char lib[GGML_MAX_NAME];
qnn::qnn_instance *instance; std::shared_ptr<qnn::qnn_instance> instance;
ggml_backend *backend; ggml_backend *backend;
QNN_INTERFACE_VER_TYPE raw_interface; QNN_INTERFACE_VER_TYPE raw_interface;
QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
qnn::qcom_socinfo socinfo; qnn::qcom_socinfo socinfo;
std::unordered_map<std::string, std::tuple<std::unique_ptr<qnn::ggml_qnn_graph_binary>, Qnn_Tensor_t *, std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph_binary>> qnn_binary_graph_cache;
Qnn_Tensor_t *, Qnn_Tensor_t *>>
qnn_graph_map;
}; };

View file

@ -73,6 +73,22 @@ public:
_graph_handle = graph_handle; _graph_handle = graph_handle;
} }
// Creates `tensor` inside this graph through the QNN interface's tensorCreateGraphTensor.
//
// `tensor` is handed to the QNN call by pointer, so the call may update it
// (ggml_qnn_tensor::bind_to_graph reads the tensor id back afterwards).
// Returns true on QNN_SUCCESS; returns false when the graph is not valid or the call fails.
bool create_graph_tensor(Qnn_Tensor_t &tensor) {
    if (!is_valid()) {
        QNN_LOG_ERROR("Invalid graph\n");
        return false;
    }

    auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor);
    if (err != QNN_SUCCESS) {
        // Cast to int to match the %d specifier, as the register_rpcmem failure log does.
        QNN_LOG_INFO("error = %d\n", (int)err);
        // Report the tensor that was passed in; the earlier `_qnn_tensor` reference came
        // from the tensor class and does not name the argument of this function.
        QNN_LOG_DEBUG("tensor%p name %s", static_cast<void *>(&tensor), QNN_TENSOR_GET_NAME(tensor));
        return false;
    }

    return true;
}
bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs, bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs,
const output_tensor_array_t &tensor_outputs) { const output_tensor_array_t &tensor_outputs) {
if (!is_valid()) { if (!is_valid()) {
@ -124,6 +140,8 @@ public:
Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; }
const std::string &get_name() const { return _graph_name; }
private: private:
const std::string _graph_name; const std::string _graph_name;
const QNNBackend _device; const QNNBackend _device;

View file

@ -49,7 +49,5 @@ using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders);
using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders);
} // namespace qnn } // namespace qnn
#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN
#define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_DEFAULT_FLAGS 1
#define RPCMEM_HEAP_ID_SYSTEM 25 #define RPCMEM_HEAP_ID_SYSTEM 25

View file

@ -637,20 +637,20 @@ public:
return 3; return 3;
} }
if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { if (is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(*p_tensor))) {
QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); QNN_LOG_WARN("tensor %s has been registered shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor));
return 4; return 4;
} }
int32_t mem_fd = rpcmem_to_fd(p_data); int32_t mem_fd = rpcmem_to_fd(p_data);
if (-1 == mem_fd) { if (mem_fd == -1) {
QNN_LOG_WARN("failed to get file descriptor\n"); QNN_LOG_WARN("failed to get file descriptor\n");
return 5; return 5;
} }
QNN_LOG_INFO("mem_fd %d\n", mem_fd); QNN_LOG_INFO("mem_fd %d\n", mem_fd);
Qnn_MemDescriptor_t descriptor = { { QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, Qnn_MemDescriptor_t descriptor = { { QNN_TENSOR_GET_RANK(*p_tensor), QNN_TENSOR_GET_DIMENSIONS(*p_tensor),
nullptr }, nullptr },
QNN_VER_PTR(*p_tensor)->dataType, QNN_TENSOR_GET_DATA_TYPE(*p_tensor),
QNN_MEM_TYPE_ION, QNN_MEM_TYPE_ION,
{ { mem_fd } } }; { { mem_fd } } };
Qnn_MemHandle_t handle = nullptr; Qnn_MemHandle_t handle = nullptr;
@ -662,9 +662,10 @@ public:
strerror(error)); strerror(error));
return 6; return 6;
} else { } else {
QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); QNN_LOG_INFO("tensor %s successfully register shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor));
} }
QNN_VER_PTR(*p_tensor)->memHandle = handle;
QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle);
_qnn_mem_set.insert((std::pair<void *, Qnn_MemHandle_t>(p_data, handle))); _qnn_mem_set.insert((std::pair<void *, Qnn_MemHandle_t>(p_data, handle)));
return 0; return 0;

View file

@ -1,127 +1,197 @@
#pragma once #pragma once
#include <atomic>
#include <memory>
#include <string>
#include "ggml-qnn.h" #include "ggml-qnn.h"
#include "QnnTensor.h" #include "QnnTensor.h"
#include "System/QnnSystemInterface.h" #include "System/QnnSystemInterface.h"
#include "backend.hpp" #include "backend.hpp"
#include "graph.hpp"
#include "qnn.hpp" #include "qnn.hpp"
#include "utils.hpp"
namespace qnn { namespace qnn {
template <Qnn_TensorType_t _tensorType> class ggml_qnn_tensor {
class ggml_qnn_tensor_readwrite {
public: public:
explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle, static ggml_qnn_tensor *from_ggml_tensor(const ggml_tensor *tensor) {
ggml_backend_qnn_context *ctx) : if (!tensor) {
_tensor(tensor), _qnn_tensor(reinterpret_cast<Qnn_Tensor_t *>(tensor->extra)), _context(ctx) { return nullptr;
_old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions;
const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type);
const bool is_npu = ctx->device == QNN_BACKEND_NPU;
QNN_VER_PTR(*_qnn_tensor)->type = _tensorType;
if (is_npu) {
QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 };
} }
auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor); return static_cast<ggml_qnn_tensor *>(tensor->extra);
if (err != QNN_SUCCESS) {
QNN_LOG_INFO("error = %d\n", err);
QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
_context = nullptr;
return;
} }
explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr<qnn_instance> qnn_instance) :
_tensor(tensor), _device(device), _qnn_instance(qnn_instance) {
_tensor_name = ggml_get_name(tensor);
if (_tensor_name.empty()) {
    // The ggml tensor has no name: synthesize one from a shared atomic counter.
    static std::atomic_uint32_t unnamed_tensor_count = 0;
    char buffer[GGML_MAX_NAME] = {};
    // The counter is an integer, so it is formatted with %u; %p requires a pointer
    // argument and passing an integer to it is undefined behaviour.
    snprintf(buffer, sizeof(buffer), "unnamed_%u", (unsigned)unnamed_tensor_count++);
    _tensor_name = buffer;
}
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
_dimensions[0] = (uint32_t)tensor->ne[0]; _dimensions[0] = (uint32_t)tensor->ne[0];
_dimensions[1] = (uint32_t)tensor->ne[1]; _dimensions[1] = (uint32_t)tensor->ne[1];
_dimensions[2] = (uint32_t)tensor->ne[2]; _dimensions[2] = (uint32_t)tensor->ne[2];
_dimensions[3] = (uint32_t)tensor->ne[3]; _dimensions[3] = (uint32_t)tensor->ne[3];
QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions; QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions);
QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor); QNN_TENSOR_SET_TYPE(_qnn_tensor, device_tensortype_from_ggml_tensor(tensor));
QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type; QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER);
QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type));
// TODO: set the quantizeParams base on the tensor type
QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor));
const bool is_npu = device == QNN_BACKEND_NPU;
if (is_npu) { if (is_npu) {
auto *instance = ctx->instance; QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
uint8_t *qnn_buffer = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void *))); QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr);
} else {
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) };
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
}
tensor->extra = this;
}
// Binds this tensor to `graph`.
//
// Returns true when the tensor is newly bound or was already bound to the same graph.
// Returns false when the tensor is invalid, is already bound to a different graph,
// the graph tensor cannot be created, or the rpc memory allocation fails.
template <size_t _InputSize, size_t _OutputSize>
bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph) {
    if (!is_valid()) {
        QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str());
        return false;
    }

    if (_graph_handle) {
        const bool same_graph = (_graph_handle == graph.get_graph_handler());
        if (same_graph) {
            QNN_LOG_INFO("tensor %s already bound to same graph %s", _tensor_name.c_str(),
                         graph.get_name().c_str());
            return true;
        }

        QNN_LOG_WARN("tensor %s has been bound to another graph", _tensor_name.c_str());
        return false;
    }

    // The graph call operates on a copy of the descriptor; only the id is copied back below.
    Qnn_Tensor_t graph_tensor = _qnn_tensor;
    if (!graph.create_graph_tensor(graph_tensor)) {
        QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str());
        return false;
    }

    if (!alloc_rpc_mem()) {
        QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str());
        return false;
    }

    QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(graph_tensor));
    _graph_handle = graph.get_graph_handler();
    return true;
}
// Copies the ggml tensor's data into the rpc buffer behind this tensor's QNN mem handle.
//
// Only tensors of type APP_WRITE or APP_READWRITE are accepted. When no mem handle is
// used (should_use_mem_handle() is false) nothing is copied and the call succeeds.
// Returns false when the tensor is invalid, not writable, or the rpc buffer is not found.
bool write_to_qnn_tensor() {
    if (!is_valid()) {
        QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str());
        return false;
    }

    const auto qnn_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
    const bool writable = (qnn_type == QNN_TENSOR_TYPE_APP_WRITE) || (qnn_type == QNN_TENSOR_TYPE_APP_READWRITE);
    if (!writable) {
        QNN_LOG_WARN("tensor %s not writable", _tensor_name.c_str());
        return false;
    }

    if (!should_use_mem_handle()) {
        // For CPU and GPU, the data is already in the tensor.
        return true;
    }

    auto *rpc_buffer =
        static_cast<uint8_t *>(_qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)));
    if (!rpc_buffer) {
        QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
        return false;
    }

    memcpy(rpc_buffer, _tensor->data, ggml_nbytes(_tensor));
    return true;
}
// Copies the rpc buffer behind this tensor's QNN mem handle back into the ggml tensor's data.
//
// Only tensors of type APP_READ or APP_READWRITE are accepted. When no mem handle is
// used (should_use_mem_handle() is false) nothing is copied and the call succeeds.
// Returns false when the tensor is invalid, not readable, or the rpc buffer is not found.
bool read_from_qnn_tensor() {
    if (!is_valid()) {
        QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str());
        return false;
    }

    const auto qnn_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
    const bool readable = (qnn_type == QNN_TENSOR_TYPE_APP_READ) || (qnn_type == QNN_TENSOR_TYPE_APP_READWRITE);
    if (!readable) {
        QNN_LOG_WARN("tensor %s not readable", _tensor_name.c_str());
        return false;
    }

    if (!should_use_mem_handle()) {
        // For CPU and GPU, the data is already in the tensor.
        return true;
    }

    auto *rpc_buffer =
        static_cast<uint8_t *>(_qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)));
    if (!rpc_buffer) {
        QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
        return false;
    }

    memcpy(_tensor->data, rpc_buffer, ggml_nbytes(_tensor));
    return true;
}
// True when this wrapper holds a non-null ggml tensor pointer.
bool is_valid() const {
    return _tensor != nullptr;
}
// Read-only access to the Qnn_Tensor_t descriptor held by this object.
const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; }
private:
bool alloc_rpc_mem() {
if (!should_use_mem_handle()) {
return true;
}
uint8_t *qnn_buffer =
static_cast<uint8_t *>(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *)));
if (!qnn_buffer) { if (!qnn_buffer) {
QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno)); QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno));
QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor)); QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str());
_context = nullptr; return false;
// No free for _qnn_tensor, because it's not registered.
return;
} else { } else {
QNN_LOG_INFO("alloc rpcmem successfully\n"); QNN_LOG_INFO("alloc rpcmem successfully\n");
} }
instance->register_rpcmem(qnn_buffer, _qnn_tensor); auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor);
if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) { if (error != QNN_SUCCESS) {
memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor)); QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error);
} QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str());
} else { return false;
QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) };
}
} }
explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor, return true;
ggml_backend_qnn_context *ctx) :
_tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) {
_old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions;
const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type);
const bool is_npu = ctx->device == QNN_BACKEND_NPU;
_dimensions[0] = (uint32_t)tensor->ne[0];
_dimensions[1] = (uint32_t)tensor->ne[1];
_dimensions[2] = (uint32_t)tensor->ne[2];
_dimensions[3] = (uint32_t)tensor->ne[3];
QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions;
QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor);
QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type;
if (is_npu) {
uint8_t *qnn_buffer =
static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle));
if (qnn_buffer) {
memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor));
} else {
QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
_context = nullptr;
return;
}
} else {
QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) };
}
} }
~ggml_qnn_tensor_readwrite() { bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; }
if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && _context &&
_context->device == QNN_BACKEND_NPU) {
uint8_t *qnn_buffer = static_cast<uint8_t *>(
_context->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle));
memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor));
}
QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions;
}
bool is_valid() const { return _context; }
Qnn_Tensor_t *get_qnn_tensor() const { return _qnn_tensor; }
private:
const ggml_tensor *_tensor; const ggml_tensor *_tensor;
Qnn_Tensor_t *_qnn_tensor; QNNBackend _device;
ggml_backend_qnn_context *_context; std::shared_ptr<qnn_instance> _qnn_instance;
uint32_t *_old_dimensions; Qnn_Tensor_t _qnn_tensor = QNN_TENSOR_INIT;
uint32_t _dimensions[4] = {}; uint32_t _dimensions[4] = {};
std::string _tensor_name;
Qnn_GraphHandle_t _graph_handle = nullptr;
ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite &) = delete; ggml_qnn_tensor(const ggml_qnn_tensor &) = delete;
void operator=(const ggml_qnn_tensor_readwrite &) = delete; void operator=(const ggml_qnn_tensor &) = delete;
ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite &&) = delete; ggml_qnn_tensor(ggml_qnn_tensor &&) = delete;
void operator=(ggml_qnn_tensor_readwrite &&) = delete; void operator=(ggml_qnn_tensor &&) = delete;
}; };
using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_READ>;
using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_WRITE>;
} // namespace qnn } // namespace qnn

View file

@ -102,6 +102,13 @@ inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) {
return QNN_TENSORMEMTYPE_UNDEFINED; return QNN_TENSORMEMTYPE_UNDEFINED;
} }
// Returns the v1 memHandle field of `tensor`, or nullptr when the tensor is not a
// QNN_TENSOR_VERSION_1 tensor.
inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) {
    const bool is_v1 = (tensor.version == QNN_TENSOR_VERSION_1);
    return is_v1 ? tensor.v1.memHandle : nullptr;
}
inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) { inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) {
if (tensor.version == QNN_TENSOR_VERSION_1) { if (tensor.version == QNN_TENSOR_VERSION_1) {
tensor.v1.id = id; tensor.v1.id = id;
@ -224,6 +231,7 @@ public:
#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor) #define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor)
#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor) #define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor)
#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor) #define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor)
#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor)
#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value) #define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value)
#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value) #define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value)