feat: add ggml_qnn_op_config to handle different ops

hongruichen 2024-07-29 23:12:51 +08:00
parent 9a5f802bb6
commit 74eb05a13b
4 changed files with 146 additions and 65 deletions


@@ -5,6 +5,7 @@
#include "graph.hpp"
#include "logger.hpp"
#include "op-config.hpp"
#include "tensor.hpp"
#include "utils.hpp"
@@ -123,40 +124,22 @@ std::string get_graph_key(const std::string &op_name, const std::array<ggml_tens
return graph_key;
}
template <size_t _InputSize, size_t _OutputSize>
qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op,
const std::array<ggml_tensor *, _InputSize> &inputs,
const std::array<ggml_tensor *, _OutputSize> &outputs) {
GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
auto &graph_cache = ctx->qnn_graph_cache;
const auto *op_name =
op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs);
auto it = graph_cache.find(graph_key);
qnn::ggml_qnn_graph *graph_ptr = nullptr;
if (it != graph_cache.end()) {
QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str());
graph_ptr = it->second.get();
} else {
auto graph = std::make_unique<qnn::ggml_qnn_graph>(graph_key, (QNNBackend)(ctx->device), ctx->instance,
ctx->socinfo.vtcm_size_in_mb);
if (!graph->is_valid()) {
return nullptr;
qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) {
if (op_name == QNN_OP_MAT_MUL) {
// For QNN_OP_MAT_MUL, we need to transpose the input tensor
return [](const std::string &name) {
auto config = std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL);
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_BOOL_8;
scalar.bool8Value = true;
config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar);
return config;
};
}
if (!graph->build_graph(qnn_op, to_ggml_tensor_array<_InputSize>(inputs),
to_ggml_tensor_array<_OutputSize>(outputs))) {
QNN_LOG_ERROR("build_graph failed\n");
return nullptr;
}
graph_ptr = graph.get();
graph_cache[graph_key] = std::move(graph);
}
return graph_ptr;
return [op_name](const std::string &name) {
return std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name);
};
}
constexpr const char *kGgmlOpToQnnOp[] = {
@@ -264,6 +247,42 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU
static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr,
"GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU");
template <size_t _InputSize, size_t _OutputSize>
qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op,
const std::array<ggml_tensor *, _InputSize> &inputs,
const std::array<ggml_tensor *, _OutputSize> &outputs) {
GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
auto &graph_cache = ctx->qnn_graph_cache;
const auto *op_name =
op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs);
auto it = graph_cache.find(graph_key);
qnn::ggml_qnn_graph *graph_ptr = nullptr;
if (it != graph_cache.end()) {
QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str());
graph_ptr = it->second.get();
} else {
auto graph = std::make_unique<qnn::ggml_qnn_graph>(graph_key, (QNNBackend)(ctx->device), ctx->instance,
ctx->socinfo.vtcm_size_in_mb);
if (!graph->is_valid()) {
return nullptr;
}
auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]);
if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
to_ggml_tensor_array<_OutputSize>(outputs))) {
QNN_LOG_ERROR("build_graph failed\n");
return nullptr;
}
graph_ptr = graph.get();
graph_cache[graph_key] = std::move(graph);
}
return graph_ptr;
}
template <ggml_op _GgmlOp>
bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
@@ -271,7 +290,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_t
CHECK_PARAMS(ctx, src0, src1, dst);
bool succeed = false;
auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst });
auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst });
if (graph_ptr) {
succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
}
@@ -292,7 +311,7 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten
CHECK_PARAMS(ctx, src, dst);
bool succeed = false;
auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst });
auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst });
if (graph_ptr) {
succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst });
}
@@ -305,7 +324,6 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten
return succeed;
}
constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
nullptr, // GGML_OP_NONE
nullptr, // GGML_OP_DUP
nullptr, // GGML_OP_ADD
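
To make the new flow concrete, here is a minimal usage sketch of the factory introduced above. generate_common_op_constructor, ggml_op_constructor_t and ggml_qnn_op_config are taken from this commit; the node name and the standalone call site are hypothetical, and the snippet assumes the headers included at the top of this file.

// Illustrative only: how the ggml_op_constructor_t factory is consumed.
qnn::ggml_op_constructor_t make_op = generate_common_op_constructor(QNN_OP_MAT_MUL);

// The returned callable creates a fresh op config for a given node name; for
// QNN_OP_MAT_MUL it already carries the TRANSPOSE_IN0 scalar parameter set above.
std::unique_ptr<qnn::ggml_qnn_op_config> op_config = make_op("node_mat_mul_example");

// build_graph() receives the callable itself rather than an op name string, so the
// graph decides when the config is created and which node name it is given.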


@@ -2,18 +2,22 @@
#pragma once
#include <cstdio>
#include <functional>
#include <memory>
#include <string>
#include <vector>
#include "ggml-qnn.h"
#include "logger.hpp"
#include "op-config.hpp"
#include "qnn-lib.hpp"
#include "tensor.hpp"
namespace qnn {
using ggml_tensor_array_t = std::vector<ggml_tensor *>;
using ggml_op_constructor_t = std::function<std::unique_ptr<qnn::ggml_qnn_op_config>(const std::string &)>;
class ggml_qnn_graph {
public:
@@ -79,15 +83,15 @@ public:
~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); }
bool build_graph(const std::string &op_name, const ggml_tensor_array_t &tensor_inputs,
bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
GGML_ASSERT(op_constructor);
if (!is_valid()) {
QNN_LOG_ERROR("Invalid graph\n");
return false;
}
QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str());
_qnn_tensor_inputs.resize(tensor_inputs.size());
_tensor_inputs.resize(tensor_inputs.size());
for (size_t i = 0; i < tensor_inputs.size(); i++) {
char buffer[GGML_MAX_NAME] = {};
@@ -100,11 +104,9 @@ public:
return false;
}
_qnn_tensor_inputs[i] = qnn_tensor->get_qnn_tensor();
_tensor_inputs[i] = qnn_tensor;
}
_qnn_tensor_outputs.resize(tensor_outputs.size());
_tensor_outputs.resize(tensor_outputs.size());
for (size_t i = 0; i < tensor_outputs.size(); i++) {
char buffer[GGML_MAX_NAME] = {};
@@ -117,23 +119,13 @@ public:
return false;
}
_qnn_tensor_outputs[i] = qnn_tensor->get_qnn_tensor();
_tensor_outputs[i] = qnn_tensor;
}
Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
config.version = QNN_OPCONFIG_VERSION_1;
auto &op_config = config.v1;
op_config.name = _graph_name.c_str();
op_config.packageName = QNN_OP_PACKAGE_NAME_QTI_AISW;
op_config.typeName = op_name.c_str();
op_config.numOfParams = (uint32_t)_param_types.size();
op_config.params = _param_types.data();
op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
op_config.inputTensors = _qnn_tensor_inputs.data();
op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
op_config.outputTensors = _qnn_tensor_outputs.data();
auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config);
_op_config = op_constructor(_graph_name);
_op_config->set_input_tensors(_tensor_inputs);
_op_config->set_output_tensors(_tensor_outputs);
auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config());
if (error != QNN_SUCCESS) {
auto *error_str = get_qnn_error_string(error);
if (error_str) {
@@ -168,8 +160,6 @@ public:
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
_qnn_tensor_inputs[i] = _tensor_inputs[i]->get_qnn_tensor();
}
for (size_t i = 0; i < tensor_outputs.size(); i++) {
@@ -178,13 +168,16 @@ public:
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
_qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor();
}
_op_config->set_input_tensors(_tensor_inputs);
_op_config->set_output_tensors(_tensor_outputs);
auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors();
auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors();
auto error =
_qnn_interface->qnn_graph_execute(_graph_handle, _qnn_tensor_inputs.data(), _qnn_tensor_inputs.size(),
_qnn_tensor_outputs.data(), _qnn_tensor_outputs.size(), nullptr, nullptr);
_qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
if (_device == QNN_BACKEND_NPU) {
if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
@@ -219,10 +212,9 @@ private:
Qnn_GraphHandle_t _graph_handle = nullptr;
std::shared_ptr<qnn_instance> _qnn_instance;
std::shared_ptr<qnn_interface> _qnn_interface;
std::vector<std::shared_ptr<qnn::ggml_qnn_tensor>> _tensor_inputs;
std::vector<std::shared_ptr<qnn::ggml_qnn_tensor>> _tensor_outputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_inputs;
std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_outputs;
std::unique_ptr<ggml_qnn_op_config> _op_config;
std::vector<Qnn_Param_t> _param_types;
DISABLE_COPY(ggml_qnn_graph);
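
Because build_graph now accepts any callable matching ggml_op_constructor_t, a caller can inject op-specific parameterization without touching the graph class itself. A hedged sketch follows; the graph, src and dst variables are hypothetical caller-owned objects, and QNN_OP_GELU is the SDK op name referenced by the static_assert in backend-ops.cpp.

// graph is a qnn::ggml_qnn_graph obtained elsewhere (e.g. from the graph cache);
// src and dst are ggml_tensor pointers owned by the caller.
qnn::ggml_op_constructor_t gelu_ctor = [](const std::string &name) {
    return std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GELU);
};

qnn::ggml_tensor_array_t inputs = { src };
qnn::ggml_tensor_array_t outputs = { dst };
if (!graph->build_graph(gelu_ctor, inputs, outputs)) {
    QNN_LOG_ERROR("build_graph failed\n");
}
// After a successful build, the execute path re-binds the ggml buffers and feeds the
// Qnn_Tensor_t arrays exposed by the op config to qnn_graph_execute.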


@@ -0,0 +1,73 @@
#pragma once
#include <string>
#include <vector>
#include "ggml-qnn.h"
#include "logger.hpp"
#include "qnn-lib.hpp"
#include "qnn-types.hpp"
#include "tensor.hpp"
namespace qnn {
class ggml_qnn_op_config {
public:
explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) :
_name(name), _package_name(package_name), _op_type(op_type) {}
void set_input_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_inputs) {
_qnn_tensor_inputs.resize(tensor_inputs.size());
for (size_t i = 0; i < tensor_inputs.size(); i++) {
_qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor();
}
}
void set_output_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_outputs) {
_qnn_tensor_outputs.resize(tensor_outputs.size());
for (size_t i = 0; i < tensor_outputs.size(); i++) {
_qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor();
}
}
void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) {
_param_names.push_back(name);
Qnn_Param_t param = QNN_PARAM_INIT;
param.paramType = QNN_PARAMTYPE_SCALAR;
param.name = _param_names.back().c_str();
param.scalarParam = scalar;
_param_types.push_back(param);
}
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() { return _qnn_tensor_inputs; }
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() { return _qnn_tensor_outputs; }
Qnn_OpConfig_t get_op_config() {
Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
config.version = QNN_OPCONFIG_VERSION_1;
auto &op_config = config.v1;
op_config.name = _name.c_str();
op_config.packageName = _package_name.c_str();
op_config.typeName = _op_type.c_str();
op_config.numOfParams = (uint32_t)_param_types.size();
op_config.params = _param_types.data();
op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
op_config.inputTensors = _qnn_tensor_inputs.data();
op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
op_config.outputTensors = _qnn_tensor_outputs.data();
return config;
}
private:
std::string _name;
std::string _package_name;
std::string _op_type;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
std::vector<Qnn_Param_t> _param_types;
std::vector<std::string> _param_names;
DISABLE_COPY(ggml_qnn_op_config);
DISABLE_MOVE(ggml_qnn_op_config);
};
} // namespace qnn
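
For reference, a standalone sketch of the new class in isolation. The node name is made up; the parameter mirrors the mat_mul case wired up in backend-ops.cpp, and everything used here appears in this commit's diff or the QNN SDK headers it relies on.

// Build an op config equivalent to what generate_common_op_constructor(QNN_OP_MAT_MUL) produces.
qnn::ggml_qnn_op_config config("node_0_example", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL);

Qnn_Scalar_t transpose = QNN_SCALAR_INIT;
transpose.dataType = QNN_DATATYPE_BOOL_8;
transpose.bool8Value = true;
config.add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, transpose);

// set_input_tensors()/set_output_tensors() snapshot the Qnn_Tensor_t handles from the
// ggml_qnn_tensor wrappers; get_op_config() then assembles the Qnn_OpConfig_t that the
// graph passes to qnn_graph_add_node().
Qnn_OpConfig_t op = config.get_op_config();
// op.v1.numOfParams == 1 here; inputs/outputs stay empty until tensors are set.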


@@ -8,8 +8,6 @@
#include "ggml-qnn.h"
#include "QnnTensor.h"
#include "System/QnnSystemInterface.h"
#include "buffer.hpp"
#include "logger.hpp"
#include "qnn-lib.hpp"