feat: add ggml_qnn_op_config to handle different ops

parent 9a5f802bb6
commit 74eb05a13b

4 changed files with 146 additions and 65 deletions
@@ -5,6 +5,7 @@
 #include "graph.hpp"
 #include "logger.hpp"
+#include "op-config.hpp"
 #include "tensor.hpp"
 #include "utils.hpp"
 
 
@@ -123,40 +124,22 @@ std::string get_graph_key(const std::string &op_name, const std::array<ggml_tens
     return graph_key;
 }
 
-template <size_t _InputSize, size_t _OutputSize>
-qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op,
-                                              const std::array<ggml_tensor *, _InputSize> &inputs,
-                                              const std::array<ggml_tensor *, _OutputSize> &outputs) {
-    GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
-
-    auto &graph_cache = ctx->qnn_graph_cache;
-    const auto *op_name =
-        op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
-    auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs);
-    auto it = graph_cache.find(graph_key);
-    qnn::ggml_qnn_graph *graph_ptr = nullptr;
-    if (it != graph_cache.end()) {
-        QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str());
-        graph_ptr = it->second.get();
-    } else {
-        auto graph = std::make_unique<qnn::ggml_qnn_graph>(graph_key, (QNNBackend)(ctx->device), ctx->instance,
-                                                           ctx->socinfo.vtcm_size_in_mb);
-
-        if (!graph->is_valid()) {
-            return nullptr;
-        }
-
-        if (!graph->build_graph(qnn_op, to_ggml_tensor_array<_InputSize>(inputs),
-                                to_ggml_tensor_array<_OutputSize>(outputs))) {
-            QNN_LOG_ERROR("build_graph failed\n");
-            return nullptr;
-        }
-
-        graph_ptr = graph.get();
-        graph_cache[graph_key] = std::move(graph);
-    }
-
-    return graph_ptr;
+qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) {
+    if (op_name == QNN_OP_MAT_MUL) {
+        // For QNN_OP_MAT_MUL, we need to transpose the input tensor
+        return [](const std::string &name) {
+            auto config = std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL);
+            Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
+            scalar.dataType = QNN_DATATYPE_BOOL_8;
+            scalar.bool8Value = true;
+            config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar);
+            return config;
+        };
+    }
+
+    return [op_name](const std::string &name) {
+        return std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name);
+    };
 }
 
 constexpr const char *kGgmlOpToQnnOp[] = {
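The factory above is the core of the change: instead of threading a raw QNN op-type string through the call chain, callers receive a qnn::ggml_op_constructor_t that stamps out a fully configured op per graph. MatMul is special-cased because ggml and QNN disagree on operand layout, so its constructor pre-sets the transpose-in0 scalar parameter. A minimal sketch of consuming such a constructor follows; everything in it comes from this diff except "node_0", a placeholder node name.

```cpp
// Sketch: consuming a generated op constructor ("node_0" is a placeholder).
auto make_mat_mul = generate_common_op_constructor(QNN_OP_MAT_MUL);
std::unique_ptr<qnn::ggml_qnn_op_config> op = make_mat_mul("node_0");
// The MatMul branch has already attached QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0,
// so once tensors are set this config is ready for qnn_graph_add_node.
Qnn_OpConfig_t cfg = op->get_op_config();
```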
@@ -264,6 +247,42 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU
 static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr,
               "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU");
 
+template <size_t _InputSize, size_t _OutputSize>
+qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op,
+                                              const std::array<ggml_tensor *, _InputSize> &inputs,
+                                              const std::array<ggml_tensor *, _OutputSize> &outputs) {
+    GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
+
+    auto &graph_cache = ctx->qnn_graph_cache;
+    const auto *op_name =
+        op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
+    auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs);
+    auto it = graph_cache.find(graph_key);
+    qnn::ggml_qnn_graph *graph_ptr = nullptr;
+    if (it != graph_cache.end()) {
+        QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str());
+        graph_ptr = it->second.get();
+    } else {
+        auto graph = std::make_unique<qnn::ggml_qnn_graph>(graph_key, (QNNBackend)(ctx->device), ctx->instance,
+                                                           ctx->socinfo.vtcm_size_in_mb);
+        if (!graph->is_valid()) {
+            return nullptr;
+        }
+
+        auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]);
+        if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
+                                to_ggml_tensor_array<_OutputSize>(outputs))) {
+            QNN_LOG_ERROR("build_graph failed\n");
+            return nullptr;
+        }
+
+        graph_ptr = graph.get();
+        graph_cache[graph_key] = std::move(graph);
+    }
+
+    return graph_ptr;
+}
+
 template <ggml_op _GgmlOp>
 bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
     static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
@@ -271,7 +290,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_t
     CHECK_PARAMS(ctx, src0, src1, dst);
 
     bool succeed = false;
-    auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst });
+    auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst });
     if (graph_ptr) {
         succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
     }
@@ -292,7 +311,7 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten
     CHECK_PARAMS(ctx, src, dst);
 
     bool succeed = false;
-    auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst });
+    auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst });
     if (graph_ptr) {
         succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst });
     }
@@ -305,7 +324,6 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten
     return succeed;
 }
-
 constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
     nullptr,  // GGML_OP_NONE
     nullptr,  // GGML_OP_DUP
     nullptr,  // GGML_OP_ADD
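With the string parameter gone, the cache lookup above derives the QNN op type internally from kGgmlOpToQnnOp[op] and wraps it in a constructor before building the graph. Below is an illustrative sketch (not from the commit) of how the templated entry points are consumed, assuming GGML_OP_ADD has a non-null kGgmlOpToQnnOp entry so the static_assert passes:

```cpp
// Illustrative only: a dispatch-table slot would hold this instantiation at
// the GGML_OP_ADD index, mirroring the nullptr-padded kQnnUnaryOpsTable above.
bool run_add(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
    return qnn_binary_op_impl<GGML_OP_ADD>(ctx, src0, src1, dst);
}
```

The remaining hunks belong to the graph wrapper header that defines class ggml_qnn_graph.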
@@ -2,18 +2,22 @@
 #pragma once
 
 #include <cstdio>
+#include <functional>
+#include <memory>
 #include <string>
 #include <vector>
 
 #include "ggml-qnn.h"
 
 #include "logger.hpp"
+#include "op-config.hpp"
 #include "qnn-lib.hpp"
 #include "tensor.hpp"
 
 namespace qnn {
 
 using ggml_tensor_array_t = std::vector<ggml_tensor *>;
+using ggml_op_constructor_t = std::function<std::unique_ptr<qnn::ggml_qnn_op_config>(const std::string &)>;
 
 class ggml_qnn_graph {
 public:
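The new ggml_op_constructor_t alias is just a factory from node name to owned op config, so any callable of that shape can participate. A hypothetical hand-rolled constructor might look like this (QNN_OP_ELEMENT_WISE_ADD is a QNN SDK op name; the lambda itself is not part of the commit):

```cpp
// Sketch: a custom constructor satisfying ggml_op_constructor_t.
qnn::ggml_op_constructor_t make_add = [](const std::string &name) {
    return std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW,
                                                     QNN_OP_ELEMENT_WISE_ADD);
};
```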
@@ -79,15 +83,15 @@ public:
 
     ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); }
 
-    bool build_graph(const std::string &op_name, const ggml_tensor_array_t &tensor_inputs,
+    bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
                      const ggml_tensor_array_t &tensor_outputs) {
+        GGML_ASSERT(op_constructor);
         if (!is_valid()) {
             QNN_LOG_ERROR("Invalid graph\n");
             return false;
         }
 
         QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str());
-        _qnn_tensor_inputs.resize(tensor_inputs.size());
         _tensor_inputs.resize(tensor_inputs.size());
         for (size_t i = 0; i < tensor_inputs.size(); i++) {
             char buffer[GGML_MAX_NAME] = {};
@@ -100,11 +104,9 @@ public:
                 return false;
             }
 
-            _qnn_tensor_inputs[i] = qnn_tensor->get_qnn_tensor();
             _tensor_inputs[i] = qnn_tensor;
         }
 
-        _qnn_tensor_outputs.resize(tensor_outputs.size());
         _tensor_outputs.resize(tensor_outputs.size());
         for (size_t i = 0; i < tensor_outputs.size(); i++) {
             char buffer[GGML_MAX_NAME] = {};
@@ -117,23 +119,13 @@ public:
                 return false;
             }
 
-            _qnn_tensor_outputs[i] = qnn_tensor->get_qnn_tensor();
             _tensor_outputs[i] = qnn_tensor;
         }
 
-        Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
-        config.version = QNN_OPCONFIG_VERSION_1;
-        auto &op_config = config.v1;
-        op_config.name = _graph_name.c_str();
-        op_config.packageName = QNN_OP_PACKAGE_NAME_QTI_AISW;
-        op_config.typeName = op_name.c_str();
-        op_config.numOfParams = (uint32_t)_param_types.size();
-        op_config.params = _param_types.data();
-        op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
-        op_config.inputTensors = _qnn_tensor_inputs.data();
-        op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
-        op_config.outputTensors = _qnn_tensor_outputs.data();
-        auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config);
+        _op_config = op_constructor(_graph_name);
+        _op_config->set_input_tensors(_tensor_inputs);
+        _op_config->set_output_tensors(_tensor_outputs);
+        auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config());
         if (error != QNN_SUCCESS) {
             auto *error_str = get_qnn_error_string(error);
             if (error_str) {
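build_graph no longer assembles a Qnn_OpConfig_t inline; it asks the constructor for an op config, hands it the bound tensors, and registers the node. A hedged sketch of the new call site, where graph, inputs, and outputs are placeholders the backend would supply and QNN_OP_GELU is a QNN SDK op name:

```cpp
// Sketch: driving the new build_graph interface (placeholder variables).
auto op_constructor = generate_common_op_constructor(QNN_OP_GELU);
if (!graph.build_graph(op_constructor, inputs, outputs)) {
    QNN_LOG_ERROR("build_graph failed\n");
}
```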
@@ -168,8 +160,6 @@ public:
                 QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                 return false;
             }
-
-            _qnn_tensor_inputs[i] = _tensor_inputs[i]->get_qnn_tensor();
         }
 
         for (size_t i = 0; i < tensor_outputs.size(); i++) {
@@ -178,13 +168,16 @@ public:
                 QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                 return false;
             }
-
-            _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor();
         }
 
+        _op_config->set_input_tensors(_tensor_inputs);
+        _op_config->set_output_tensors(_tensor_outputs);
+        auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors();
+        auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors();
+
         auto error =
-            _qnn_interface->qnn_graph_execute(_graph_handle, _qnn_tensor_inputs.data(), _qnn_tensor_inputs.size(),
-                                              _qnn_tensor_outputs.data(), _qnn_tensor_outputs.size(), nullptr, nullptr);
+            _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
+                                              qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
         if (_device == QNN_BACKEND_NPU) {
             if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
                 QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
@@ -219,10 +212,9 @@ private:
     Qnn_GraphHandle_t _graph_handle = nullptr;
     std::shared_ptr<qnn_instance> _qnn_instance;
     std::shared_ptr<qnn_interface> _qnn_interface;
-    std::vector<std::shared_ptr<qnn::ggml_qnn_tensor>> _tensor_inputs;
-    std::vector<std::shared_ptr<qnn::ggml_qnn_tensor>> _tensor_outputs;
-    std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
-    std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
+    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_inputs;
+    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_outputs;
+    std::unique_ptr<ggml_qnn_op_config> _op_config;
     std::vector<Qnn_Param_t> _param_types;
 
     DISABLE_COPY(ggml_qnn_graph);
ggml/src/ggml-qnn/op-config.hpp (new file, 73 lines)

@@ -0,0 +1,73 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "ggml-qnn.h"
+
+#include "logger.hpp"
+#include "qnn-lib.hpp"
+#include "qnn-types.hpp"
+#include "tensor.hpp"
+
+namespace qnn {
+class ggml_qnn_op_config {
+public:
+    explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) :
+        _name(name), _package_name(package_name), _op_type(op_type) {}
+
+    void set_input_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_inputs) {
+        _qnn_tensor_inputs.resize(tensor_inputs.size());
+        for (size_t i = 0; i < tensor_inputs.size(); i++) {
+            _qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor();
+        }
+    }
+
+    void set_output_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_outputs) {
+        _qnn_tensor_outputs.resize(tensor_outputs.size());
+        for (size_t i = 0; i < tensor_outputs.size(); i++) {
+            _qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor();
+        }
+    }
+
+    void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) {
+        _param_names.push_back(name);
+        Qnn_Param_t param = QNN_PARAM_INIT;
+        param.paramType = QNN_PARAMTYPE_SCALAR;
+        param.name = _param_names.back().c_str();
+        param.scalarParam = scalar;
+        _param_types.push_back(param);
+    }
+
+    std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() { return _qnn_tensor_inputs; }
+    std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() { return _qnn_tensor_outputs; }
+
+    Qnn_OpConfig_t get_op_config() {
+        Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
+        config.version = QNN_OPCONFIG_VERSION_1;
+        auto &op_config = config.v1;
+        op_config.name = _name.c_str();
+        op_config.packageName = _package_name.c_str();
+        op_config.typeName = _op_type.c_str();
+        op_config.numOfParams = (uint32_t)_param_types.size();
+        op_config.params = _param_types.data();
+        op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
+        op_config.inputTensors = _qnn_tensor_inputs.data();
+        op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
+        op_config.outputTensors = _qnn_tensor_outputs.data();
+        return config;
+    }
+
+private:
+    std::string _name;
+    std::string _package_name;
+    std::string _op_type;
+    std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
+    std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
+    std::vector<Qnn_Param_t> _param_types;
+    std::vector<std::string> _param_names;
+
+    DISABLE_COPY(ggml_qnn_op_config);
+    DISABLE_MOVE(ggml_qnn_op_config);
+};
+} // namespace qnn
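The class is a thin builder over Qnn_OpConfig_t: it owns the Qnn_Tensor_t arrays, the scalar params, and the _param_names strings that each Qnn_Param_t::name points into, so the returned config stays valid only as long as the ggml_qnn_op_config instance does. A usage sketch that mirrors the MatMul path above ("mat_mul_0" is a placeholder name; the tensor vectors are elided):

```cpp
// Sketch: hand-building the MatMul config that generate_common_op_constructor
// produces. set_input_tensors/set_output_tensors would be called with the
// graph's shared_ptr<ggml_qnn_tensor> vectors before get_op_config().
qnn::ggml_qnn_op_config config("mat_mul_0", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL);

Qnn_Scalar_t transpose_in0 = QNN_SCALAR_INIT;
transpose_in0.dataType = QNN_DATATYPE_BOOL_8;
transpose_in0.bool8Value = true;
config.add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, transpose_in0);

Qnn_OpConfig_t op = config.get_op_config();  // borrows pointers into `config`
```

The final hunk below appears to drop two QNN SDK includes that now arrive transitively through qnn-lib.hpp.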
@@ -8,8 +8,6 @@
 
 #include "ggml-qnn.h"
 
-#include "QnnTensor.h"
-#include "System/QnnSystemInterface.h"
 #include "buffer.hpp"
 #include "logger.hpp"
 #include "qnn-lib.hpp"