refactoring ggml_qnn_tensor
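
For context, a minimal sketch of the call flow this refactor moves the backend to. run_add_through_qnn is a hypothetical helper written only for illustration; the members it uses (from_ggml_tensor, bind_to_graph, add_nodes, write_to_qnn_tensor, execute, read_from_qnn_tensor) come from the diff below, while the graph object, includes and error reporting are assumed. In the actual code the graph is cached per op in ctx->qnn_binary_graph_cache, so the bind/add_nodes half only runs on first use.

static bool run_add_through_qnn(qnn::ggml_qnn_graph_binary &graph, const ggml_tensor *src0, const ggml_tensor *src1,
                                ggml_tensor *dst) {
    // The wrapper now lives in tensor->extra, installed by ggml_backend_qnn_buffer_init_tensor().
    auto *in0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0);
    auto *in1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1);
    auto *out = qnn::ggml_qnn_tensor::from_ggml_tensor(dst);
    if (!in0 || !in1 || !out) {
        return false;
    }
    // bind_to_graph() creates the graph tensor and, on NPU, allocates/registers rpc memory exactly once.
    if (!in0->bind_to_graph(graph) || !in1->bind_to_graph(graph) || !out->bind_to_graph(graph)) {
        return false;
    }
    if (!graph.add_nodes(QNN_OP_ELEMENT_WISE_ADD, { in0->get_qnn_tensor(), in1->get_qnn_tensor() },
                         { out->get_qnn_tensor() })) {
        return false;
    }
    // Same order as execute_graph(): write inputs, run the graph, read outputs back.
    if (!in0->write_to_qnn_tensor() || !in1->write_to_qnn_tensor()) {
        return false;
    }
    if (!graph.execute()) {
        return false;
    }
    return out->read_from_qnn_tensor();
}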

Hongrui Chen 2024-07-07 23:51:12 +08:00 committed by hongruichen
parent 874216b9c8
commit 5f2e3918f6
8 changed files with 301 additions and 212 deletions

@@ -1,5 +1,6 @@
#include "ggml-qnn.h"
#include <list>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
@@ -81,7 +82,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
.threads = 1,
.name = "qnn-cpu",
.lib = "libQnnCpu.so",
.instance = nullptr,
.backend = nullptr,
.raw_interface = {},
.raw_system_interface = {},
@@ -91,7 +91,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
.threads = 1,
.name = "qnn-gpu",
.lib = "libQnnGpu.so",
.instance = nullptr,
.backend = nullptr,
.raw_interface = {},
.raw_system_interface = {},
@@ -101,7 +100,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
.threads = 1,
.name = "qnn-npu",
.lib = "libQnnHtp.so",
.instance = nullptr,
.backend = nullptr,
.raw_interface = {},
.raw_system_interface = {},
@@ -112,23 +110,16 @@ struct ggml_backend_qnn_buffer_context {
ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {}
~ggml_backend_qnn_buffer_context() {
tensors.clear();
if (buffer) {
free(buffer);
}
for (auto *qnn_tensor : qnn_tensors) {
qnn::device_tensor_free(*qnn_tensor);
free(qnn_tensor);
}
qnn_tensors.clear();
}
void *buffer = nullptr;
struct ggml_backend_qnn_context *backend_ctx = nullptr;
std::list<std::unique_ptr<qnn::ggml_qnn_tensor>> tensors;
size_t buffer_size = 0;
std::vector<Qnn_Tensor_t *> qnn_tensors;
size_t device;
std::string name;
};
@@ -235,37 +226,14 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu
GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) {
ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context;
Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t));
if (!p_qnn_tensor) {
QNN_LOG_WARN("calloc failed");
return;
}
static int idx = 0;
char tensor_name[GGML_MAX_NAME] = { 0 };
snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++);
Qnn_DataType_t qnn_data_type = qnn::device_datatype_from_ggml_datatype(tensor->type);
Qnn_TensorType_t qnn_tensor_type = qnn::device_tensortype_from_ggml_tensor(tensor);
Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW;
if (ctx->device == QNN_BACKEND_GPU) {
qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
}
uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2],
(uint32_t)tensor->ne[3] };
Qnn_Tensor_t qnn_tensor;
qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type,
qnn_data_type, dimensions);
Qnn_ErrorHandle_t error = qnn::device_tensor_deep_copy(qnn_tensor, *p_qnn_tensor);
if (error != QNN_SUCCESS) {
free(p_qnn_tensor);
QNN_LOG_WARN("init tensor failed");
auto instance = ctx->backend_ctx->instance;
auto qnn_tensor = std::make_unique<qnn::ggml_qnn_tensor>(tensor, (QNNBackend)(ctx->device), instance);
if (!qnn_tensor->is_valid()) {
QNN_LOG_WARN("Create ggml_qnn_tensor failed");
return;
}
tensor->extra = p_qnn_tensor;
ctx->qnn_tensors.push_back(p_qnn_tensor);
ctx->tensors.push_back(std::move(qnn_tensor));
}
GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor,
@@ -373,17 +341,16 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) {
ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context;
QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name);
auto *instance = g_qnn_mgr[ctx->device].instance;
if (instance != nullptr) {
for (const auto &graph_item : ctx->qnn_graph_map) {
auto instance = g_qnn_mgr[ctx->device].instance;
if (instance) {
for (const auto &graph_item : ctx->qnn_binary_graph_cache) {
QNN_LOG_INFO("graph type:%s", graph_item.first.c_str());
}
ctx->qnn_graph_map.clear();
ctx->qnn_binary_graph_cache.clear();
instance->qnn_finalize();
delete instance;
g_qnn_mgr[ctx->device].instance = nullptr;
g_qnn_mgr[ctx->device].instance.reset();
}
if (g_qnn_mgr[ctx->device].backend != nullptr) {
@@ -582,17 +549,15 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) {
}
}
auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, "");
auto instance = std::make_shared<qnn::qnn_instance>(qnn_lib_path, g_qnn_mgr[device].lib, "");
result = instance->qnn_init(nullptr);
if (0 != result) {
if (result != 0) {
QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device));
delete instance;
return nullptr;
}
auto qnn_interface = instance->get_qnn_interface();
if (!qnn_interface.is_loaded()) {
QNN_LOG_WARN("qnn subsystem failure\n");
delete instance;
return nullptr;
}

@@ -23,10 +23,10 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
return false;
}
auto *instance = ctx->instance;
auto *tensor0 = src0->extra;
auto *tensor1 = src1->extra;
auto *tensor2 = dst->extra;
auto instance = ctx->instance;
auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0);
auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1);
auto *tensor2 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst);
if (!instance || !tensor0 || !tensor1 || !tensor2) {
QNN_LOG_WARN("invalid tensors\n");
return false;
@@ -35,6 +35,80 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
return true;
}
template <size_t _InputSize, size_t _OutputSize>
bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name,
const std::array<const ggml_tensor *, _InputSize> &inputs,
const std::array<ggml_tensor *, _OutputSize> &outputs) {
std::array<Qnn_Tensor_t, _InputSize> qnn_input_tensors;
for (size_t i = 0; i < inputs.size(); ++i) {
auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]);
if (!tensor || !tensor->bind_to_graph(*graph)) {
return false;
}
qnn_input_tensors[i] = tensor->get_qnn_tensor();
}
std::array<Qnn_Tensor_t, _OutputSize> qnn_output_tensors;
for (size_t i = 0; i < outputs.size(); ++i) {
auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]);
if (!tensor || !tensor->bind_to_graph(*graph)) {
return false;
}
qnn_output_tensors[i] = tensor->get_qnn_tensor();
}
if (!graph->add_nodes(op_name, qnn_input_tensors, qnn_output_tensors)) {
return false;
}
return true;
}
template <size_t _InputSize>
bool write_to_qnn_tensors(const std::array<const ggml_tensor *, _InputSize> &inputs) {
for (auto &input : inputs) {
auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(input);
if (!tensor || !tensor->write_to_qnn_tensor()) {
return false;
}
}
return true;
}
template <size_t _OutputSize>
bool read_from_qnn_tensors(const std::array<ggml_tensor *, _OutputSize> &outputs) {
for (auto &output : outputs) {
auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output);
if (!tensor || !tensor->read_from_qnn_tensor()) {
return false;
}
}
return true;
}
template <size_t _InputSize, size_t _OutputSize>
bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph,
const std::array<const ggml_tensor *, _InputSize> &inputs,
const std::array<ggml_tensor *, _OutputSize> &outputs) {
if (!write_to_qnn_tensors<_InputSize>(inputs)) {
return false;
}
if (!graph->execute()) {
return false;
}
if (!read_from_qnn_tensors<_OutputSize>(outputs)) {
return false;
}
return true;
}
} // namespace
#ifndef NDEBUG
@@ -61,13 +135,10 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
bool succeed = false;
std::string graph_key(ggml_op_name(GGML_OP_ADD));
auto it = ctx->qnn_graph_map.find(graph_key);
if (it != ctx->qnn_graph_map.end()) {
const auto &graph_item = it->second;
qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
std::get<0>(graph_item)->execute();
auto it = ctx->qnn_binary_graph_cache.find(graph_key);
qnn::ggml_qnn_graph_binary *graph_ptr = nullptr;
if (it != ctx->qnn_binary_graph_cache.end()) {
graph_ptr = it->second.get();
} else {
graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name;
auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device),
@@ -78,34 +149,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
goto failure;
}
qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx);
if (!tensor_input0.is_valid()) {
goto failure;
}
qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx);
if (!tensor_input1.is_valid()) {
goto failure;
}
qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx);
if (!tensor_output.is_valid()) {
if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst })) {
goto failure;
}
if (!graph->add_nodes(QNN_OP_ELEMENT_WISE_ADD,
{ *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() },
{ *tensor_output.get_qnn_tensor() })) {
goto failure;
}
if (!graph->execute()) {
goto failure;
}
ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(),
tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor());
graph_ptr = graph.get();
ctx->qnn_binary_graph_cache[graph_key] = std::move(graph);
}
succeed = true;
succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
failure:
if (!succeed) {
@@ -143,13 +195,10 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s
bool succeed = false;
std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT));
auto it = ctx->qnn_graph_map.find(graph_key);
if (it != ctx->qnn_graph_map.end()) {
const auto &graph_item = it->second;
qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
std::get<0>(graph_item)->execute();
auto it = ctx->qnn_binary_graph_cache.find(graph_key);
qnn::ggml_qnn_graph_binary *graph_ptr = nullptr;
if (it != ctx->qnn_binary_graph_cache.end()) {
graph_ptr = it->second.get();
} else {
graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name;
auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device),
@@ -160,33 +209,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s
goto failure;
}
qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx);
if (!tensor_input0.is_valid()) {
goto failure;
}
qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx);
if (!tensor_input1.is_valid()) {
goto failure;
}
qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx);
if (!tensor_output.is_valid()) {
if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_MAT_MUL, { src0, src1 }, { dst })) {
goto failure;
}
if (!graph->add_nodes(QNN_OP_MAT_MUL, { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() },
{ *tensor_output.get_qnn_tensor() })) {
goto failure;
}
if (!graph->execute()) {
goto failure;
}
ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(),
tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor());
graph_ptr = graph.get();
ctx->qnn_binary_graph_cache[graph_key] = std::move(graph);
}
succeed = true;
succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
failure:
if (!succeed) {

@@ -16,12 +16,10 @@ struct ggml_backend_qnn_context {
int threads;
char name[GGML_MAX_NAME];
char lib[GGML_MAX_NAME];
qnn::qnn_instance *instance;
std::shared_ptr<qnn::qnn_instance> instance;
ggml_backend *backend;
QNN_INTERFACE_VER_TYPE raw_interface;
QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
qnn::qcom_socinfo socinfo;
std::unordered_map<std::string, std::tuple<std::unique_ptr<qnn::ggml_qnn_graph_binary>, Qnn_Tensor_t *,
Qnn_Tensor_t *, Qnn_Tensor_t *>>
qnn_graph_map;
std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph_binary>> qnn_binary_graph_cache;
};

@@ -73,6 +73,22 @@ public:
_graph_handle = graph_handle;
}
bool create_graph_tensor(Qnn_Tensor_t &tensor) {
if (!is_valid()) {
QNN_LOG_ERROR("Invalid graph\n");
return false;
}
auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor);
if (err != QNN_SUCCESS) {
QNN_LOG_INFO("error = %d\n", err);
QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
return false;
}
return true;
}
bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs,
const output_tensor_array_t &tensor_outputs) {
if (!is_valid()) {
@@ -124,6 +140,8 @@ public:
Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; }
const std::string &get_name() const { return _graph_name; }
private:
const std::string _graph_name;
const QNNBackend _device;

@@ -49,7 +49,5 @@ using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders);
using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders);
} // namespace qnn
#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN
#define RPCMEM_DEFAULT_FLAGS 1
#define RPCMEM_HEAP_ID_SYSTEM 25

@@ -637,20 +637,20 @@ public:
return 3;
}
if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
if (is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(*p_tensor))) {
QNN_LOG_WARN("tensor %s has been registered shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor));
return 4;
}
int32_t mem_fd = rpcmem_to_fd(p_data);
if (-1 == mem_fd) {
if (mem_fd == -1) {
QNN_LOG_WARN("failed to get file descriptor\n");
return 5;
}
QNN_LOG_INFO("mem_fd %d\n", mem_fd);
Qnn_MemDescriptor_t descriptor = { { QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions,
Qnn_MemDescriptor_t descriptor = { { QNN_TENSOR_GET_RANK(*p_tensor), QNN_TENSOR_GET_DIMENSIONS(*p_tensor),
nullptr },
QNN_VER_PTR(*p_tensor)->dataType,
QNN_TENSOR_GET_DATA_TYPE(*p_tensor),
QNN_MEM_TYPE_ION,
{ { mem_fd } } };
Qnn_MemHandle_t handle = nullptr;
@@ -662,9 +662,10 @@ public:
strerror(error));
return 6;
} else {
QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
QNN_LOG_INFO("tensor %s successfully register shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor));
}
QNN_VER_PTR(*p_tensor)->memHandle = handle;
QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle);
_qnn_mem_set.insert((std::pair<void *, Qnn_MemHandle_t>(p_data, handle)));
return 0;

@@ -1,127 +1,197 @@
#pragma once
#include <atomic>
#include <memory>
#include <string>
#include "ggml-qnn.h"
#include "QnnTensor.h"
#include "System/QnnSystemInterface.h"
#include "backend.hpp"
#include "graph.hpp"
#include "qnn.hpp"
#include "utils.hpp"
namespace qnn {
template <Qnn_TensorType_t _tensorType>
class ggml_qnn_tensor_readwrite {
class ggml_qnn_tensor {
public:
explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle,
ggml_backend_qnn_context *ctx) :
_tensor(tensor), _qnn_tensor(reinterpret_cast<Qnn_Tensor_t *>(tensor->extra)), _context(ctx) {
_old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions;
const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type);
const bool is_npu = ctx->device == QNN_BACKEND_NPU;
QNN_VER_PTR(*_qnn_tensor)->type = _tensorType;
if (is_npu) {
QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 };
static ggml_qnn_tensor *from_ggml_tensor(const ggml_tensor *tensor) {
if (!tensor) {
return nullptr;
}
auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor);
if (err != QNN_SUCCESS) {
QNN_LOG_INFO("error = %d\n", err);
QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
_context = nullptr;
return;
}
_dimensions[0] = (uint32_t)tensor->ne[0];
_dimensions[1] = (uint32_t)tensor->ne[1];
_dimensions[2] = (uint32_t)tensor->ne[2];
_dimensions[3] = (uint32_t)tensor->ne[3];
QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions;
QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor);
QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type;
if (is_npu) {
auto *instance = ctx->instance;
uint8_t *qnn_buffer = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void *)));
if (!qnn_buffer) {
QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
_context = nullptr;
// No free for _qnn_tensor, because it's not registered.
return;
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
instance->register_rpcmem(qnn_buffer, _qnn_tensor);
if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) {
memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor));
}
} else {
QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) };
}
return static_cast<ggml_qnn_tensor *>(tensor->extra);
}
explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor,
ggml_backend_qnn_context *ctx) :
_tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) {
_old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions;
const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type);
const bool is_npu = ctx->device == QNN_BACKEND_NPU;
explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr<qnn_instance> qnn_instance) :
_tensor(tensor), _device(device), _qnn_instance(qnn_instance) {
_tensor_name = ggml_get_name(tensor);
if (_tensor_name.empty()) {
static std::atomic_uint32_t unnamed_tensor_count = 0;
char buffer[GGML_MAX_NAME] = {};
snprintf(buffer, sizeof(buffer), "unnamed_%p", unnamed_tensor_count++);
_tensor_name = buffer;
}
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
_dimensions[0] = (uint32_t)tensor->ne[0];
_dimensions[1] = (uint32_t)tensor->ne[1];
_dimensions[2] = (uint32_t)tensor->ne[2];
_dimensions[3] = (uint32_t)tensor->ne[3];
QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions;
QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor);
QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type;
QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions);
QNN_TENSOR_SET_TYPE(_qnn_tensor, device_tensortype_from_ggml_tensor(tensor));
QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER);
QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type));
// TODO: set the quantizeParams base on the tensor type
QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor));
const bool is_npu = device == QNN_BACKEND_NPU;
if (is_npu) {
uint8_t *qnn_buffer =
static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle));
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr);
} else {
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) };
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
}
tensor->extra = this;
}
template <size_t _InputSize, size_t _OutputSize>
bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph) {
if (!is_valid()) {
QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str());
return false;
}
if (_graph_handle) {
if (_graph_handle != graph.get_graph_handler()) {
QNN_LOG_WARN("tensor %s has been bound to another graph", _tensor_name.c_str());
return false;
} else {
QNN_LOG_INFO("tensor %s already bound to same graph %s", _tensor_name.c_str(),
graph.get_name().c_str());
return true;
}
}
Qnn_Tensor_t tensor = _qnn_tensor;
if (!graph.create_graph_tensor(tensor)) {
QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str());
return false;
}
if (!alloc_rpc_mem()) {
QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str());
return false;
}
QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor));
_graph_handle = graph.get_graph_handler();
return true;
}
bool write_to_qnn_tensor() {
if (!is_valid()) {
QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str());
return false;
}
auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) {
QNN_LOG_WARN("tensor %s not writable", _tensor_name.c_str());
return false;
}
if (should_use_mem_handle()) {
uint8_t *qnn_buffer = static_cast<uint8_t *>(
_qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)));
if (qnn_buffer) {
memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor));
memcpy(qnn_buffer, _tensor->data, ggml_nbytes(_tensor));
} else {
QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
_context = nullptr;
return;
return false;
}
} else {
QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) };
}
// For CPU and GPU, the data is already in the tensor.
return true;
}
~ggml_qnn_tensor_readwrite() {
if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && _context &&
_context->device == QNN_BACKEND_NPU) {
bool read_from_qnn_tensor() {
if (!is_valid()) {
QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str());
return false;
}
auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) {
QNN_LOG_WARN("tensor %s not readable", _tensor_name.c_str());
return false;
}
if (should_use_mem_handle()) {
uint8_t *qnn_buffer = static_cast<uint8_t *>(
_context->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle));
memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor));
_qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)));
if (qnn_buffer) {
memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor));
} else {
QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
return false;
}
}
QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions;
// For CPU and GPU, the data is already in the tensor.
return true;
}
bool is_valid() const { return _context; }
Qnn_Tensor_t *get_qnn_tensor() const { return _qnn_tensor; }
bool is_valid() const { return _tensor; }
const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; }
private:
const ggml_tensor *_tensor;
Qnn_Tensor_t *_qnn_tensor;
ggml_backend_qnn_context *_context;
uint32_t *_old_dimensions;
uint32_t _dimensions[4] = {};
bool alloc_rpc_mem() {
if (!should_use_mem_handle()) {
return true;
}
ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite &) = delete;
void operator=(const ggml_qnn_tensor_readwrite &) = delete;
ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite &&) = delete;
void operator=(ggml_qnn_tensor_readwrite &&) = delete;
uint8_t *qnn_buffer =
static_cast<uint8_t *>(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *)));
if (!qnn_buffer) {
QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno));
QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str());
return false;
} else {
QNN_LOG_INFO("alloc rpcmem successfully\n");
}
auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error);
QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str());
return false;
}
return true;
}
bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; }
const ggml_tensor *_tensor;
QNNBackend _device;
std::shared_ptr<qnn_instance> _qnn_instance;
Qnn_Tensor_t _qnn_tensor = QNN_TENSOR_INIT;
uint32_t _dimensions[4] = {};
std::string _tensor_name;
Qnn_GraphHandle_t _graph_handle = nullptr;
ggml_qnn_tensor(const ggml_qnn_tensor &) = delete;
void operator=(const ggml_qnn_tensor &) = delete;
ggml_qnn_tensor(ggml_qnn_tensor &&) = delete;
void operator=(ggml_qnn_tensor &&) = delete;
};
using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_READ>;
using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_WRITE>;
} // namespace qnn

@@ -102,6 +102,13 @@ inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) {
return QNN_TENSORMEMTYPE_UNDEFINED;
}
inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) {
if (tensor.version == QNN_TENSOR_VERSION_1) {
return tensor.v1.memHandle;
}
return nullptr;
}
inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) {
if (tensor.version == QNN_TENSOR_VERSION_1) {
tensor.v1.id = id;
@@ -224,6 +231,7 @@ public:
#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor)
#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor)
#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor)
#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor)
#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value)
#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value)