refactoring ggml_qnn_tensor
parent 874216b9c8
commit 5f2e3918f6
8 changed files with 301 additions and 212 deletions
@@ -1,5 +1,6 @@
 #include "ggml-qnn.h"

+#include <list>
 #include <stdatomic.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -81,7 +82,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
         .threads = 1,
         .name = "qnn-cpu",
         .lib = "libQnnCpu.so",
-        .instance = nullptr,
         .backend = nullptr,
         .raw_interface = {},
         .raw_system_interface = {},
@@ -91,7 +91,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
         .threads = 1,
         .name = "qnn-gpu",
         .lib = "libQnnGpu.so",
-        .instance = nullptr,
         .backend = nullptr,
         .raw_interface = {},
         .raw_system_interface = {},
@@ -101,7 +100,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
         .threads = 1,
         .name = "qnn-npu",
         .lib = "libQnnHtp.so",
-        .instance = nullptr,
         .backend = nullptr,
         .raw_interface = {},
         .raw_system_interface = {},
@@ -112,23 +110,16 @@ struct ggml_backend_qnn_buffer_context {
     ggml_backend_qnn_buffer_context(size_t device) : device(device), name(QNN_BACKEND_NAME + std::to_string(device)) {}

     ~ggml_backend_qnn_buffer_context() {
+        tensors.clear();
         if (buffer) {
             free(buffer);
         }
-
-        for (auto *qnn_tensor : qnn_tensors) {
-            qnn::device_tensor_free(*qnn_tensor);
-            free(qnn_tensor);
-        }
-
-        qnn_tensors.clear();
     }

     void *buffer = nullptr;
+
+    struct ggml_backend_qnn_context *backend_ctx = nullptr;
+
+    std::list<std::unique_ptr<qnn::ggml_qnn_tensor>> tensors;
     size_t buffer_size = 0;
-    std::vector<Qnn_Tensor_t *> qnn_tensors;
     size_t device;
     std::string name;
 };
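Note: the hunk above replaces manual Qnn_Tensor_t cleanup with RAII ownership. A minimal standalone sketch of that pattern, with illustrative names rather than the types from this commit: clearing a std::list of std::unique_ptr runs each wrapper's destructor, which is what makes the old device_tensor_free/free loop unnecessary.

#include <list>
#include <memory>

struct tensor_wrapper {
    ~tensor_wrapper() { /* release device-side resources here */ }
};

struct buffer_context {
    std::list<std::unique_ptr<tensor_wrapper>> tensors;
    ~buffer_context() { tensors.clear(); } // destroys every wrapper exactly once
};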
@@ -235,37 +226,14 @@ GGML_CALL static void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t bu
 GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) {
     ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context;

-    Qnn_Tensor_t *p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t));
-    if (!p_qnn_tensor) {
-        QNN_LOG_WARN("calloc failed");
-        return;
-    }
-
-    static int idx = 0;
-    char tensor_name[GGML_MAX_NAME] = { 0 };
-    snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++);
-    Qnn_DataType_t qnn_data_type = qnn::device_datatype_from_ggml_datatype(tensor->type);
-    Qnn_TensorType_t qnn_tensor_type = qnn::device_tensortype_from_ggml_tensor(tensor);
-    Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW;
-    if (ctx->device == QNN_BACKEND_GPU) {
-        qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
-    }
-
-    uint32_t dimensions[] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2],
-                              (uint32_t)tensor->ne[3] };
-    Qnn_Tensor_t qnn_tensor;
-    qnn::device_tensor_init(qnn_tensor, qnn::get_ggml_tensor_rank(tensor), qnn_mem_type, tensor_name, qnn_tensor_type,
-                            qnn_data_type, dimensions);
-
-    Qnn_ErrorHandle_t error = qnn::device_tensor_deep_copy(qnn_tensor, *p_qnn_tensor);
-    if (error != QNN_SUCCESS) {
-        free(p_qnn_tensor);
-        QNN_LOG_WARN("init tensor failed");
+    auto instance = ctx->backend_ctx->instance;
+    auto qnn_tensor = std::make_unique<qnn::ggml_qnn_tensor>(tensor, (QNNBackend)(ctx->device), instance);
+    if (!qnn_tensor->is_valid()) {
+        QNN_LOG_WARN("Create ggml_qnn_tensor failed");
         return;
     }

-    tensor->extra = p_qnn_tensor;
-    ctx->qnn_tensors.push_back(p_qnn_tensor);
+    ctx->tensors.push_back(std::move(qnn_tensor));
 }

 GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor,
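Note: init_tensor no longer stores a raw Qnn_Tensor_t in tensor->extra; the ggml_qnn_tensor constructor registers the wrapper itself there (see tensor.hpp below), and from_ggml_tensor() recovers it with a cast. A standalone sketch of that round trip, with illustrative names:

#include <cstdio>

struct host_tensor { void *extra = nullptr; }; // stands in for ggml_tensor

struct wrapper {
    explicit wrapper(host_tensor *t) { t->extra = this; } // registration at construction
    static wrapper *from(const host_tensor *t) {
        return t ? static_cast<wrapper *>(t->extra) : nullptr; // recovery by cast
    }
};

int main() {
    host_tensor t;
    wrapper w(&t);
    std::printf("round trip ok: %d\n", wrapper::from(&t) == &w);
    return 0;
}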
@@ -373,17 +341,16 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) {
     ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *)backend->context;
     QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name);

-    auto *instance = g_qnn_mgr[ctx->device].instance;
-    if (instance != nullptr) {
-        for (const auto &graph_item : ctx->qnn_graph_map) {
+    auto instance = g_qnn_mgr[ctx->device].instance;
+    if (instance) {
+        for (const auto &graph_item : ctx->qnn_binary_graph_cache) {
             QNN_LOG_INFO("graph type:%s", graph_item.first.c_str());
         }

-        ctx->qnn_graph_map.clear();
+        ctx->qnn_binary_graph_cache.clear();

         instance->qnn_finalize();
-        delete instance;
-        g_qnn_mgr[ctx->device].instance = nullptr;
+        g_qnn_mgr[ctx->device].instance.reset();
     }

     if (g_qnn_mgr[ctx->device].backend != nullptr) {
@@ -582,17 +549,15 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char *qnn_lib_path) {
         }
     }

-    auto *instance = new qnn::qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, "");
+    auto instance = std::make_shared<qnn::qnn_instance>(qnn_lib_path, g_qnn_mgr[device].lib, "");
     result = instance->qnn_init(nullptr);
-    if (0 != result) {
+    if (result != 0) {
         QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device));
-        delete instance;
         return nullptr;
     }
     auto qnn_interface = instance->get_qnn_interface();
     if (!qnn_interface.is_loaded()) {
         QNN_LOG_WARN("qnn subsystem failure\n");
-        delete instance;
         return nullptr;
     }

@@ -23,10 +23,10 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
         return false;
     }

-    auto *instance = ctx->instance;
-    auto *tensor0 = src0->extra;
-    auto *tensor1 = src1->extra;
-    auto *tensor2 = dst->extra;
+    auto instance = ctx->instance;
+    auto *tensor0 = qnn::ggml_qnn_tensor::from_ggml_tensor(src0);
+    auto *tensor1 = qnn::ggml_qnn_tensor::from_ggml_tensor(src1);
+    auto *tensor2 = qnn::ggml_qnn_tensor::from_ggml_tensor(dst);
     if (!instance || !tensor0 || !tensor1 || !tensor2) {
         QNN_LOG_WARN("invalid tensors\n");
         return false;
@@ -35,6 +35,80 @@ bool qnn_is_valid_params(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
     return true;
 }

+template <size_t _InputSize, size_t _OutputSize>
+bool qnn_bind_tensors_to_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph, const std::string &op_name,
+                               const std::array<const ggml_tensor *, _InputSize> &inputs,
+                               const std::array<ggml_tensor *, _OutputSize> &outputs) {
+    std::array<Qnn_Tensor_t, _InputSize> qnn_input_tensors;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+        auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(inputs[i]);
+        if (!tensor || !tensor->bind_to_graph(*graph)) {
+            return false;
+        }
+
+        qnn_input_tensors[i] = tensor->get_qnn_tensor();
+    }
+
+    std::array<Qnn_Tensor_t, _OutputSize> qnn_output_tensors;
+    for (size_t i = 0; i < outputs.size(); ++i) {
+        auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(outputs[i]);
+        if (!tensor || !tensor->bind_to_graph(*graph)) {
+            return false;
+        }
+
+        qnn_output_tensors[i] = tensor->get_qnn_tensor();
+    }
+
+    if (!graph->add_nodes(op_name, qnn_input_tensors, qnn_output_tensors)) {
+        return false;
+    }
+
+    return true;
+}
+
+template <size_t _InputSize>
+bool write_to_qnn_tensors(const std::array<const ggml_tensor *, _InputSize> &inputs) {
+    for (auto &input : inputs) {
+        auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(input);
+        if (!tensor || !tensor->write_to_qnn_tensor()) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+template <size_t _OutputSize>
+bool read_from_qnn_tensors(const std::array<ggml_tensor *, _OutputSize> &outputs) {
+    for (auto &output : outputs) {
+        auto tensor = qnn::ggml_qnn_tensor::from_ggml_tensor(output);
+        if (!tensor || !tensor->read_from_qnn_tensor()) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+template <size_t _InputSize, size_t _OutputSize>
+bool execute_graph(qnn::ggml_qnn_graph<_InputSize, _OutputSize> *graph,
+                   const std::array<const ggml_tensor *, _InputSize> &inputs,
+                   const std::array<ggml_tensor *, _OutputSize> &outputs) {
+    if (!write_to_qnn_tensors<_InputSize>(inputs)) {
+        return false;
+    }
+
+    if (!graph->execute()) {
+        return false;
+    }
+
+    if (!read_from_qnn_tensors<_OutputSize>(outputs)) {
+        return false;
+    }
+
+    return true;
+}
+
 } // namespace

 #ifndef NDEBUG
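Note: a hedged sketch of how an op handler is expected to drive these helpers. run_add is a hypothetical caller (not part of this commit), and it assumes the graph was already created the way ggml_qnn_add does below.

// Hypothetical caller; assumes the helper templates above are visible in this
// translation unit and that `graph` points at a live ggml_qnn_graph_binary.
static bool run_add(qnn::ggml_qnn_graph_binary *graph, const ggml_tensor *src0, const ggml_tensor *src1,
                    ggml_tensor *dst) {
    if (!qnn_bind_tensors_to_graph<2, 1>(graph, QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst })) {
        return false; // binding creates the QNN graph tensors on first use
    }
    return execute_graph<2, 1>(graph, { src0, src1 }, { dst }); // write -> execute -> read
}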
@@ -61,13 +135,10 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,

     bool succeed = false;
     std::string graph_key(ggml_op_name(GGML_OP_ADD));
-    auto it = ctx->qnn_graph_map.find(graph_key);
-    if (it != ctx->qnn_graph_map.end()) {
-        const auto &graph_item = it->second;
-        qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
-        qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
-        qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
-        std::get<0>(graph_item)->execute();
+    auto it = ctx->qnn_binary_graph_cache.find(graph_key);
+    qnn::ggml_qnn_graph_binary *graph_ptr = nullptr;
+    if (it != ctx->qnn_binary_graph_cache.end()) {
+        graph_ptr = it->second.get();
     } else {
         graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name;
         auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device),
@@ -78,34 +149,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context *ctx, const ggml_tensor *src0,
             goto failure;
         }

-        qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx);
-        if (!tensor_input0.is_valid()) {
-            goto failure;
-        }
-        qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx);
-        if (!tensor_input1.is_valid()) {
-            goto failure;
-        }
-        qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx);
-        if (!tensor_output.is_valid()) {
+        if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_ELEMENT_WISE_ADD, { src0, src1 }, { dst })) {
             goto failure;
         }

-        if (!graph->add_nodes(QNN_OP_ELEMENT_WISE_ADD,
-                              { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() },
-                              { *tensor_output.get_qnn_tensor() })) {
-            goto failure;
-        }
-
-        if (!graph->execute()) {
-            goto failure;
-        }
-
-        ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(),
-                                                        tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor());
+        graph_ptr = graph.get();
+        ctx->qnn_binary_graph_cache[graph_key] = std::move(graph);
     }

-    succeed = true;
+    succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });

 failure:
     if (!succeed) {
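Note: a standalone sketch of the caching pattern this hunk switches to, with illustrative names: look the compiled graph up by op key, build and insert it on a miss, and keep a raw non-owning pointer so the same execute path serves both branches.

#include <memory>
#include <string>
#include <unordered_map>

struct compiled_graph { bool execute() { return true; } };
using graph_cache = std::unordered_map<std::string, std::unique_ptr<compiled_graph>>;

compiled_graph *get_or_create(graph_cache &cache, const std::string &key) {
    auto it = cache.find(key);
    if (it != cache.end()) {
        return it->second.get(); // cache hit: reuse the compiled graph
    }
    auto graph = std::make_unique<compiled_graph>();
    compiled_graph *ptr = graph.get(); // take the raw pointer before moving ownership
    cache[key] = std::move(graph);
    return ptr;
}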
@@ -143,13 +195,10 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s

     bool succeed = false;
     std::string graph_key(ggml_op_name(GGML_OP_MUL_MAT));
-    auto it = ctx->qnn_graph_map.find(graph_key);
-    if (it != ctx->qnn_graph_map.end()) {
-        const auto &graph_item = it->second;
-        qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
-        qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
-        qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);
-        std::get<0>(graph_item)->execute();
+    auto it = ctx->qnn_binary_graph_cache.find(graph_key);
+    qnn::ggml_qnn_graph_binary *graph_ptr = nullptr;
+    if (it != ctx->qnn_binary_graph_cache.end()) {
+        graph_ptr = it->second.get();
     } else {
         graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name;
         auto graph = std::make_unique<qnn::ggml_qnn_graph_binary>(graph_name, (QNNBackend)(ctx->device),
@@ -160,33 +209,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context *ctx, const ggml_tensor *s
             goto failure;
         }

-        qnn::ggml_qnn_tensor_input tensor_input0(src0, graph->get_graph_handler(), ctx);
-        if (!tensor_input0.is_valid()) {
-            goto failure;
-        }
-        qnn::ggml_qnn_tensor_input tensor_input1(src1, graph->get_graph_handler(), ctx);
-        if (!tensor_input1.is_valid()) {
-            goto failure;
-        }
-        qnn::ggml_qnn_tensor_output tensor_output(dst, graph->get_graph_handler(), ctx);
-        if (!tensor_output.is_valid()) {
+        if (!qnn_bind_tensors_to_graph<2, 1>(graph.get(), QNN_OP_MAT_MUL, { src0, src1 }, { dst })) {
             goto failure;
         }

-        if (!graph->add_nodes(QNN_OP_MAT_MUL, { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() },
-                              { *tensor_output.get_qnn_tensor() })) {
-            goto failure;
-        }
-
-        if (!graph->execute()) {
-            goto failure;
-        }
-
-        ctx->qnn_graph_map[graph_key] = std::make_tuple(std::move(graph), tensor_input0.get_qnn_tensor(),
-                                                        tensor_input1.get_qnn_tensor(), tensor_output.get_qnn_tensor());
+        graph_ptr = graph.get();
+        ctx->qnn_binary_graph_cache[graph_key] = std::move(graph);
     }

-    succeed = true;
+    succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });

 failure:
     if (!succeed) {
@@ -16,12 +16,10 @@ struct ggml_backend_qnn_context {
     int threads;
     char name[GGML_MAX_NAME];
    char lib[GGML_MAX_NAME];
-    qnn::qnn_instance *instance;
+    std::shared_ptr<qnn::qnn_instance> instance;
     ggml_backend *backend;
     QNN_INTERFACE_VER_TYPE raw_interface;
     QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
     qnn::qcom_socinfo socinfo;
-    std::unordered_map<std::string, std::tuple<std::unique_ptr<qnn::ggml_qnn_graph_binary>, Qnn_Tensor_t *,
-                                               Qnn_Tensor_t *, Qnn_Tensor_t *>>
-        qnn_graph_map;
+    std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph_binary>> qnn_binary_graph_cache;
 };
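Note: with instance now a std::shared_ptr, the backend context and every ggml_qnn_tensor built from it share ownership of the QNN instance, which is why the explicit delete in ggml_backend_qnn_free above becomes a reset(). A minimal sketch of the shape, with illustrative stub types:

#include <memory>
#include <utility>

struct instance_stub {}; // stands in for qnn::qnn_instance

struct context_stub {
    std::shared_ptr<instance_stub> instance; // was a raw pointer plus manual delete
};

struct tensor_stub {
    std::shared_ptr<instance_stub> instance; // keeps the instance alive while in use
    explicit tensor_stub(std::shared_ptr<instance_stub> i) : instance(std::move(i)) {}
};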
@@ -73,6 +73,22 @@ public:
         _graph_handle = graph_handle;
     }

+    bool create_graph_tensor(Qnn_Tensor_t &tensor) {
+        if (!is_valid()) {
+            QNN_LOG_ERROR("Invalid graph\n");
+            return false;
+        }
+
+        auto err = _qnn_interface.tensorCreateGraphTensor(_graph_handle, &tensor);
+        if (err != QNN_SUCCESS) {
+            QNN_LOG_INFO("error = %d\n", err);
+            QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
+            return false;
+        }
+
+        return true;
+    }
+
     bool add_nodes(const std::string &op_name, const input_tensor_array_t &tensor_inputs,
                    const output_tensor_array_t &tensor_outputs) {
         if (!is_valid()) {
@@ -124,6 +140,8 @@ public:

     Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; }

+    const std::string &get_name() const { return _graph_name; }
+
 private:
     const std::string _graph_name;
     const QNNBackend _device;
@@ -49,7 +49,5 @@ using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders);
 using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders);
 } // namespace qnn

-#define QNN_VER_PTR(x) (&((x).v1)) // TODO: remove this macro after we have a separate header for QNN
-
 #define RPCMEM_DEFAULT_FLAGS 1
 #define RPCMEM_HEAP_ID_SYSTEM 25
@@ -637,20 +637,20 @@ public:
             return 3;
         }

-        if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
-            QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
+        if (is_rpcmem_registered(QNN_TENSOR_GET_MEM_HANDLE(*p_tensor))) {
+            QNN_LOG_WARN("tensor %s has been registered shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor));
             return 4;
         }

         int32_t mem_fd = rpcmem_to_fd(p_data);
-        if (-1 == mem_fd) {
+        if (mem_fd == -1) {
             QNN_LOG_WARN("failed to get file descriptor\n");
             return 5;
         }
         QNN_LOG_INFO("mem_fd %d\n", mem_fd);
-        Qnn_MemDescriptor_t descriptor = { { QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions,
+        Qnn_MemDescriptor_t descriptor = { { QNN_TENSOR_GET_RANK(*p_tensor), QNN_TENSOR_GET_DIMENSIONS(*p_tensor),
                                              nullptr },
-                                           QNN_VER_PTR(*p_tensor)->dataType,
+                                           QNN_TENSOR_GET_DATA_TYPE(*p_tensor),
                                            QNN_MEM_TYPE_ION,
                                            { { mem_fd } } };
         Qnn_MemHandle_t handle = nullptr;
|
@ -662,9 +662,10 @@ public:
|
|||
strerror(error));
|
||||
return 6;
|
||||
} else {
|
||||
QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
|
||||
QNN_LOG_INFO("tensor %s successfully register shared memory\n", QNN_TENSOR_GET_NAME(*p_tensor));
|
||||
}
|
||||
QNN_VER_PTR(*p_tensor)->memHandle = handle;
|
||||
|
||||
QNN_TENSOR_SET_MEM_HANDLE(*p_tensor, handle);
|
||||
_qnn_mem_set.insert((std::pair<void *, Qnn_MemHandle_t>(p_data, handle)));
|
||||
|
||||
return 0;
|
||||
|
|
|
@@ -1,127 +1,197 @@

 #pragma once

+#include <atomic>
+#include <memory>
+#include <string>
+
 #include "ggml-qnn.h"

 #include "QnnTensor.h"
 #include "System/QnnSystemInterface.h"
 #include "backend.hpp"
+#include "graph.hpp"
 #include "qnn.hpp"
 #include "utils.hpp"

 namespace qnn {

-template <Qnn_TensorType_t _tensorType>
-class ggml_qnn_tensor_readwrite {
+class ggml_qnn_tensor {
 public:
-    explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_GraphHandle_t graph_handle,
-                                       ggml_backend_qnn_context *ctx) :
-        _tensor(tensor), _qnn_tensor(reinterpret_cast<Qnn_Tensor_t *>(tensor->extra)), _context(ctx) {
-        _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions;
-        const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type);
-        const bool is_npu = ctx->device == QNN_BACKEND_NPU;
-        QNN_VER_PTR(*_qnn_tensor)->type = _tensorType;
-        if (is_npu) {
-            QNN_VER_PTR(*_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*_qnn_tensor)->clientBuf = { .data = nullptr, .dataSize = 0 };
+    static ggml_qnn_tensor *from_ggml_tensor(const ggml_tensor *tensor) {
+        if (!tensor) {
+            return nullptr;
         }

-        auto err = ctx->raw_interface.tensorCreateGraphTensor(graph_handle, _qnn_tensor);
-        if (err != QNN_SUCCESS) {
-            QNN_LOG_INFO("error = %d\n", err);
-            QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
-            _context = nullptr;
-            return;
-        }
-
-        _dimensions[0] = (uint32_t)tensor->ne[0];
-        _dimensions[1] = (uint32_t)tensor->ne[1];
-        _dimensions[2] = (uint32_t)tensor->ne[2];
-        _dimensions[3] = (uint32_t)tensor->ne[3];
-        QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions;
-        QNN_VER_PTR(*_qnn_tensor)->rank = qnn::get_ggml_tensor_rank(tensor);
-        QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type;
-
-        if (is_npu) {
-            auto *instance = ctx->instance;
-            uint8_t *qnn_buffer = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(tensor), alignof(void *)));
-            if (!qnn_buffer) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-                QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
-                _context = nullptr;
-                // No free for _qnn_tensor, because it's not registered.
-                return;
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-
-            instance->register_rpcmem(qnn_buffer, _qnn_tensor);
-            if (_tensorType == QNN_TENSOR_TYPE_APP_WRITE || _tensorType == QNN_TENSOR_TYPE_APP_READWRITE) {
-                memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor));
-            }
-        } else {
-            QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) };
-        }
+        return static_cast<ggml_qnn_tensor *>(tensor->extra);
     }

-    explicit ggml_qnn_tensor_readwrite(const ggml_tensor *tensor, Qnn_Tensor_t *qnn_tensor,
-                                       ggml_backend_qnn_context *ctx) :
-        _tensor(tensor), _qnn_tensor(qnn_tensor), _context(ctx) {
-        _old_dimensions = QNN_VER_PTR(*_qnn_tensor)->dimensions;
-        const auto qnn_data_type = device_datatype_from_ggml_datatype(tensor->type);
-        const bool is_npu = ctx->device == QNN_BACKEND_NPU;
+    explicit ggml_qnn_tensor(ggml_tensor *tensor, QNNBackend device, std::shared_ptr<qnn_instance> qnn_instance) :
+        _tensor(tensor), _device(device), _qnn_instance(qnn_instance) {
+        _tensor_name = ggml_get_name(tensor);
+        if (_tensor_name.empty()) {
+            static std::atomic_uint32_t unnamed_tensor_count = 0;
+            char buffer[GGML_MAX_NAME] = {};
+            snprintf(buffer, sizeof(buffer), "unnamed_%p", unnamed_tensor_count++);
+            _tensor_name = buffer;
+        }

+        QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
         _dimensions[0] = (uint32_t)tensor->ne[0];
         _dimensions[1] = (uint32_t)tensor->ne[1];
         _dimensions[2] = (uint32_t)tensor->ne[2];
         _dimensions[3] = (uint32_t)tensor->ne[3];
-        QNN_VER_PTR(*_qnn_tensor)->dimensions = _dimensions;
-        QNN_VER_PTR(*_qnn_tensor)->rank = get_ggml_tensor_rank(tensor);
-        QNN_VER_PTR(*_qnn_tensor)->dataType = qnn_data_type;
+        QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions);
+        QNN_TENSOR_SET_TYPE(_qnn_tensor, device_tensortype_from_ggml_tensor(tensor));
+        QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER);
+        QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type));
+        // TODO: set the quantizeParams base on the tensor type
+        QNN_TENSOR_SET_RANK(_qnn_tensor, qnn::get_ggml_tensor_rank(tensor));

+        const bool is_npu = device == QNN_BACKEND_NPU;
         if (is_npu) {
-            uint8_t *qnn_buffer =
-                static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle));
+            QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
+            QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, nullptr);
         } else {
+            QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
+            Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) };
+            QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
         }

+        tensor->extra = this;
     }

+    template <size_t _InputSize, size_t _OutputSize>
+    bool bind_to_graph(ggml_qnn_graph<_InputSize, _OutputSize> &graph) {
+        if (!is_valid()) {
+            QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str());
+            return false;
+        }
+
+        if (_graph_handle) {
+            if (_graph_handle != graph.get_graph_handler()) {
+                QNN_LOG_WARN("tensor %s has been bound to another graph", _tensor_name.c_str());
+                return false;
+            } else {
+                QNN_LOG_INFO("tensor %s already bound to same graph %s", _tensor_name.c_str(),
+                             graph.get_name().c_str());
+                return true;
+            }
+        }
+
+        Qnn_Tensor_t tensor = _qnn_tensor;
+        if (!graph.create_graph_tensor(tensor)) {
+            QNN_LOG_WARN("create graph tensor failed, tensor %s", _tensor_name.c_str());
+            return false;
+        }
+
+        if (!alloc_rpc_mem()) {
+            QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str());
+            return false;
+        }
+
+        QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(tensor));
+        _graph_handle = graph.get_graph_handler();
+        return true;
+    }
+
+    bool write_to_qnn_tensor() {
+        if (!is_valid()) {
+            QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str());
+            return false;
+        }
+
+        auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
+        if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) {
+            QNN_LOG_WARN("tensor %s not writable", _tensor_name.c_str());
+            return false;
+        }
+
+        if (should_use_mem_handle()) {
+            uint8_t *qnn_buffer = static_cast<uint8_t *>(
+                _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)));
+            if (qnn_buffer) {
-                memcpy(qnn_buffer, tensor->data, ggml_nbytes(tensor));
+                memcpy(qnn_buffer, _tensor->data, ggml_nbytes(_tensor));
             } else {
                 QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
-                QNN_LOG_DEBUG("tensor%p name %s", _qnn_tensor, QNN_TENSOR_GET_NAME(*_qnn_tensor));
-                _context = nullptr;
-                return;
+                return false;
             }
-        } else {
-            QNN_VER_PTR(*_qnn_tensor)->clientBuf = { tensor->data, get_ggml_tensor_data_size(tensor) };
         }

+        // For CPU and GPU, the data is already in the tensor.
+        return true;
     }

-    ~ggml_qnn_tensor_readwrite() {
-        if ((_tensorType == QNN_TENSOR_TYPE_APP_READWRITE || _tensorType == QNN_TENSOR_TYPE_APP_READ) && _context &&
-            _context->device == QNN_BACKEND_NPU) {
+    bool read_from_qnn_tensor() {
+        if (!is_valid()) {
+            QNN_LOG_WARN("tensor %s not valid", _tensor_name.c_str());
+            return false;
+        }
+
+        auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
+        if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) {
+            QNN_LOG_WARN("tensor %s not readable", _tensor_name.c_str());
+            return false;
+        }
+
+        if (should_use_mem_handle()) {
             uint8_t *qnn_buffer = static_cast<uint8_t *>(
-                _context->instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*_qnn_tensor)->memHandle));
-            memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor));
+                _qnn_instance->get_rpcmem_from_memhandle(QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)));
+            if (qnn_buffer) {
+                memcpy(_tensor->data, qnn_buffer, ggml_nbytes(_tensor));
+            } else {
+                QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
+                return false;
+            }
         }

-        QNN_VER_PTR(*_qnn_tensor)->dimensions = _old_dimensions;
+        // For CPU and GPU, the data is already in the tensor.
+        return true;
     }

-    bool is_valid() const { return _context; }
-    Qnn_Tensor_t *get_qnn_tensor() const { return _qnn_tensor; }
+    bool is_valid() const { return _tensor; }
+    const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; }

 private:
-    const ggml_tensor *_tensor;
-    Qnn_Tensor_t *_qnn_tensor;
-    ggml_backend_qnn_context *_context;
-    uint32_t *_old_dimensions;
-    uint32_t _dimensions[4] = {};
+    bool alloc_rpc_mem() {
+        if (!should_use_mem_handle()) {
+            return true;
+        }

-    ggml_qnn_tensor_readwrite(const ggml_qnn_tensor_readwrite &) = delete;
-    void operator=(const ggml_qnn_tensor_readwrite &) = delete;
-    ggml_qnn_tensor_readwrite(ggml_qnn_tensor_readwrite &&) = delete;
-    void operator=(ggml_qnn_tensor_readwrite &&) = delete;
+        uint8_t *qnn_buffer =
+            static_cast<uint8_t *>(_qnn_instance->alloc_rpcmem(ggml_nbytes(_tensor), alignof(void *)));
+        if (!qnn_buffer) {
+            QNN_LOG_WARN("alloc rpc mem failure, %s\n", strerror(errno));
+            QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str());
+            return false;
+        } else {
+            QNN_LOG_INFO("alloc rpcmem successfully\n");
+        }
+
+        auto error = _qnn_instance->register_rpcmem(qnn_buffer, &_qnn_tensor);
+        if (error != QNN_SUCCESS) {
+            QNN_LOG_WARN("register rpc mem failure, %d\n", (int)error);
+            QNN_LOG_DEBUG("tensor name %s", _tensor_name.c_str());
+            return false;
+        }
+
+        return true;
+    }
+
+    bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; }
+
+    const ggml_tensor *_tensor;
+    QNNBackend _device;
+    std::shared_ptr<qnn_instance> _qnn_instance;
+    Qnn_Tensor_t _qnn_tensor = QNN_TENSOR_INIT;
+    uint32_t _dimensions[4] = {};
+    std::string _tensor_name;
+    Qnn_GraphHandle_t _graph_handle = nullptr;
+
+    ggml_qnn_tensor(const ggml_qnn_tensor &) = delete;
+    void operator=(const ggml_qnn_tensor &) = delete;
+    ggml_qnn_tensor(ggml_qnn_tensor &&) = delete;
+    void operator=(ggml_qnn_tensor &&) = delete;
 };

-using ggml_qnn_tensor_output = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_READ>;
-using ggml_qnn_tensor_input = ggml_qnn_tensor_readwrite<QNN_TENSOR_TYPE_APP_WRITE>;
-
 } // namespace qnn
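Note: a hedged sketch of the intended lifecycle of the new class, assuming the ggml_qnn_graph type from graph.hpp in this commit; node setup via add_nodes is omitted, and run_once is a hypothetical caller, not an API in the tree.

// Assumes the headers from this commit; `graph` stands for a graph whose nodes
// were already added and finalized elsewhere.
static bool run_once(qnn::ggml_qnn_graph<1, 1> &graph, const ggml_tensor *src, ggml_tensor *dst) {
    auto *in = qnn::ggml_qnn_tensor::from_ggml_tensor(src); // recovered from tensor->extra
    auto *out = qnn::ggml_qnn_tensor::from_ggml_tensor(dst);
    if (!in || !out || !in->bind_to_graph(graph) || !out->bind_to_graph(graph)) {
        return false; // first bind creates the graph tensor and any rpc buffers
    }
    return in->write_to_qnn_tensor() && graph.execute() && out->read_from_qnn_tensor();
}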
@@ -102,6 +102,13 @@ inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t &tensor) {
     return QNN_TENSORMEMTYPE_UNDEFINED;
 }

+inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t &tensor) {
+    if (tensor.version == QNN_TENSOR_VERSION_1) {
+        return tensor.v1.memHandle;
+    }
+    return nullptr;
+}
+
 inline void set_qnn_tensor_id(Qnn_Tensor_t &tensor, uint32_t id) {
     if (tensor.version == QNN_TENSOR_VERSION_1) {
         tensor.v1.id = id;
@@ -224,6 +231,7 @@ public:
 #define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor)
 #define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor)
 #define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor)
+#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor)

 #define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value)
 #define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value)
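Note: a minimal standalone sketch of the version-checked accessor pattern behind these macros, with illustrative stub types: each getter and setter tests the tagged union's version field first, where the old QNN_VER_PTR macro dereferenced v1 unconditionally.

#include <cstdint>

struct tensor_v1_stub { uint32_t rank; };
struct tensor_stub {
    int version = 1;
    union { tensor_v1_stub v1; };
};

inline uint32_t get_rank(const tensor_stub &t) {
    return t.version == 1 ? t.v1.rank : 0; // unknown versions fall back safely
}

inline void set_rank(tensor_stub &t, uint32_t rank) {
    if (t.version == 1) {
        t.v1.rank = rank; // writes are guarded the same way
    }
}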