diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp
index 4d83fd5d1..d264ec766 100644
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@@ -5,6 +5,7 @@
 
 #include "graph.hpp"
 #include "logger.hpp"
+#include "op-config.hpp"
 #include "tensor.hpp"
 #include "utils.hpp"
 
@@ -123,40 +124,22 @@ std::string get_graph_key(const std::string &op_name, const std::array
 }
 
-template <size_t _InputSize, size_t _OutputSize>
-qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op, const std::string &qnn_op,
-                                              const std::array<ggml_tensor *, _InputSize> &inputs,
-                                              const std::array<ggml_tensor *, _OutputSize> &outputs) {
-    GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
-
-    auto &graph_cache = ctx->qnn_graph_cache;
-    const auto *op_name =
-        op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
-    auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs);
-    auto it = graph_cache.find(graph_key);
-    qnn::ggml_qnn_graph *graph_ptr = nullptr;
-    if (it != graph_cache.end()) {
-        QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str());
-        graph_ptr = it->second.get();
-    } else {
-        auto graph = std::make_unique<qnn::ggml_qnn_graph>(graph_key, (QNNBackend)(ctx->device), ctx->instance,
-                                                           ctx->socinfo.vtcm_size_in_mb);
-
-        if (!graph->is_valid()) {
-            return nullptr;
-        }
-
-        if (!graph->build_graph(qnn_op, to_ggml_tensor_array<_InputSize>(inputs),
-                                to_ggml_tensor_array<_OutputSize>(outputs))) {
-            QNN_LOG_ERROR("build_graph failed\n");
-            return nullptr;
-        }
-
-        graph_ptr = graph.get();
-        graph_cache[graph_key] = std::move(graph);
+qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) {
+    if (op_name == QNN_OP_MAT_MUL) {
+        // For QNN_OP_MAT_MUL, we need to transpose the input tensor
+        return [](const std::string &name) {
+            auto config = std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL);
+            Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
+            scalar.dataType = QNN_DATATYPE_BOOL_8;
+            scalar.bool8Value = true;
+            config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar);
+            return config;
+        };
     }
-    return graph_ptr;
+    return [op_name](const std::string &name) {
+        return std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name);
+    };
 }
 
 constexpr const char *kGgmlOpToQnnOp[] = {
@@ -264,6 +247,42 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU
 static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr,
               "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU");
 
+template <size_t _InputSize, size_t _OutputSize>
+qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_context *ctx, size_t op,
+                                              const std::array<ggml_tensor *, _InputSize> &inputs,
+                                              const std::array<ggml_tensor *, _OutputSize> &outputs) {
+    GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
+
+    auto &graph_cache = ctx->qnn_graph_cache;
+    const auto *op_name =
+        op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
+    auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs);
+    auto it = graph_cache.find(graph_key);
+    qnn::ggml_qnn_graph *graph_ptr = nullptr;
+    if (it != graph_cache.end()) {
+        QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str());
+        graph_ptr = it->second.get();
+    } else {
+        auto graph = std::make_unique<qnn::ggml_qnn_graph>(graph_key, (QNNBackend)(ctx->device), ctx->instance,
+                                                           ctx->socinfo.vtcm_size_in_mb);
+        if (!graph->is_valid()) {
+            return nullptr;
+        }
+
+        auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]);
+        if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
+                                to_ggml_tensor_array<_OutputSize>(outputs))) {
+            QNN_LOG_ERROR("build_graph failed\n");
+            return nullptr;
+        }
+
+        graph_ptr = graph.get();
+        graph_cache[graph_key] = std::move(graph);
+    }
+
+    return graph_ptr;
+}
+
 template <ggml_op _GgmlOp>
 bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
     static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
@@ -271,7 +290,7 @@ bool qnn_binary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src0, ggml_t
     CHECK_PARAMS(ctx, src0, src1, dst);
 
     bool succeed = false;
-    auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src0, src1 }, { dst });
+    auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst });
     if (graph_ptr) {
         succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
     }
@@ -292,7 +311,7 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten
     CHECK_PARAMS(ctx, src, dst);
 
     bool succeed = false;
-    auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, kGgmlOpToQnnOp[_GgmlOp], { src }, { dst });
+    auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst });
     if (graph_ptr) {
         succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst });
     }
@@ -305,7 +324,6 @@ bool qnn_unary_op_impl(ggml_backend_qnn_context *ctx, ggml_tensor *src, ggml_ten
     return succeed;
 }
 
 constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
-    nullptr, // GGML_OP_NONE
     nullptr, // GGML_OP_DUP
     nullptr, // GGML_OP_ADD
diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp
index 01190e183..1beb4b31b 100644
--- a/ggml/src/ggml-qnn/graph.hpp
+++ b/ggml/src/ggml-qnn/graph.hpp
@@ -2,18 +2,22 @@
 #pragma once
 
 #include <array>
+#include <functional>
 #include <memory>
+#include <string>
 #include <vector>
 
 #include "ggml-qnn.h"
 
 #include "logger.hpp"
+#include "op-config.hpp"
 #include "qnn-lib.hpp"
 #include "tensor.hpp"
 
 namespace qnn {
 
 using ggml_tensor_array_t = std::vector<ggml_tensor *>;
+using ggml_op_constructor_t = std::function<std::unique_ptr<qnn::ggml_qnn_op_config>(const std::string &)>;
 
 class ggml_qnn_graph {
 public:
@@ -79,15 +83,15 @@ public:
 
     ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); }
 
-    bool build_graph(const std::string &op_name, const ggml_tensor_array_t &tensor_inputs,
+    bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
                      const ggml_tensor_array_t &tensor_outputs) {
+        GGML_ASSERT(op_constructor);
         if (!is_valid()) {
             QNN_LOG_ERROR("Invalid graph\n");
             return false;
         }
 
         QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str());
-        _qnn_tensor_inputs.resize(tensor_inputs.size());
         _tensor_inputs.resize(tensor_inputs.size());
         for (size_t i = 0; i < tensor_inputs.size(); i++) {
             char buffer[GGML_MAX_NAME] = {};
@@ -100,11 +104,9 @@ public:
                 return false;
             }
 
-            _qnn_tensor_inputs[i] = qnn_tensor->get_qnn_tensor();
             _tensor_inputs[i] = qnn_tensor;
         }
 
-        _qnn_tensor_outputs.resize(tensor_outputs.size());
         _tensor_outputs.resize(tensor_outputs.size());
         for (size_t i = 0; i < tensor_outputs.size(); i++) {
             char buffer[GGML_MAX_NAME] = {};
@@ -117,23 +119,13 @@ public:
                 return false;
             }
 
-            _qnn_tensor_outputs[i] = qnn_tensor->get_qnn_tensor();
             _tensor_outputs[i] = qnn_tensor;
         }
 
-        Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
-        config.version = QNN_OPCONFIG_VERSION_1;
-        auto &op_config = config.v1;
-        op_config.name = _graph_name.c_str();
-        op_config.packageName = QNN_OP_PACKAGE_NAME_QTI_AISW;
-        op_config.typeName = op_name.c_str();
-        op_config.numOfParams = (uint32_t)_param_types.size();
-        op_config.params = _param_types.data();
-        op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
-        op_config.inputTensors = _qnn_tensor_inputs.data();
-        op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
-        op_config.outputTensors = _qnn_tensor_outputs.data();
-        auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, config);
+        _op_config = op_constructor(_graph_name);
+        _op_config->set_input_tensors(_tensor_inputs);
+        _op_config->set_output_tensors(_tensor_outputs);
+        auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config());
         if (error != QNN_SUCCESS) {
             auto *error_str = get_qnn_error_string(error);
             if (error_str) {
@@ -168,8 +160,6 @@ public:
                 QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                 return false;
             }
-
-            _qnn_tensor_inputs[i] = _tensor_inputs[i]->get_qnn_tensor();
         }
 
         for (size_t i = 0; i < tensor_outputs.size(); i++) {
@@ -178,13 +168,16 @@ public:
                 QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                 return false;
             }
-
-            _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor();
         }
 
+        _op_config->set_input_tensors(_tensor_inputs);
+        _op_config->set_output_tensors(_tensor_outputs);
+        auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors();
+        auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors();
+
         auto error =
-            _qnn_interface->qnn_graph_execute(_graph_handle, _qnn_tensor_inputs.data(), _qnn_tensor_inputs.size(),
-                                              _qnn_tensor_outputs.data(), _qnn_tensor_outputs.size(), nullptr, nullptr);
+            _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
+                                              qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
         if (_device == QNN_BACKEND_NPU) {
             if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
                 QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
@@ -219,10 +212,9 @@ private:
     Qnn_GraphHandle_t _graph_handle = nullptr;
     std::shared_ptr<qnn_instance> _qnn_instance;
    std::shared_ptr<qnn_interface> _qnn_interface;
-    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_inputs;
-    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_outputs;
-    std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
-    std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
+    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_inputs;
+    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_outputs;
+    std::unique_ptr<ggml_qnn_op_config> _op_config;
     std::vector<Qnn_Param_t> _param_types;
 
     DISABLE_COPY(ggml_qnn_graph);
diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp
new file mode 100644
index 000000000..de75c9358
--- /dev/null
+++ b/ggml/src/ggml-qnn/op-config.hpp
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "ggml-qnn.h"
+
+#include "logger.hpp"
+#include "qnn-lib.hpp"
+#include "qnn-types.hpp"
+#include "tensor.hpp"
+
+namespace qnn {
+class ggml_qnn_op_config {
+public:
+    explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) :
+        _name(name), _package_name(package_name), _op_type(op_type) {}
+
+    void set_input_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_inputs) {
+        _qnn_tensor_inputs.resize(tensor_inputs.size());
+        for (size_t i = 0; i < tensor_inputs.size(); i++) {
+            _qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor();
+        }
+    }
+
+    void set_output_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_outputs) {
+        _qnn_tensor_outputs.resize(tensor_outputs.size());
+        for (size_t i = 0; i < tensor_outputs.size(); i++) {
+            _qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor();
+        }
+    }
+
+    void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) {
+        _param_names.push_back(name);
+        Qnn_Param_t param = QNN_PARAM_INIT;
+        param.paramType = QNN_PARAMTYPE_SCALAR;
+        param.name = _param_names.back().c_str();
+        param.scalarParam = scalar;
+        _param_types.push_back(param);
+    }
+
+    std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() { return _qnn_tensor_inputs; }
+    std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() { return _qnn_tensor_outputs; }
+
+    Qnn_OpConfig_t get_op_config() {
+        Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
+        config.version = QNN_OPCONFIG_VERSION_1;
+        auto &op_config = config.v1;
+        op_config.name = _name.c_str();
+        op_config.packageName = _package_name.c_str();
+        op_config.typeName = _op_type.c_str();
+        op_config.numOfParams = (uint32_t)_param_types.size();
+        op_config.params = _param_types.data();
+        op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
+        op_config.inputTensors = _qnn_tensor_inputs.data();
+        op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
+        op_config.outputTensors = _qnn_tensor_outputs.data();
+        return config;
+    }
+
+private:
+    std::string _name;
+    std::string _package_name;
+    std::string _op_type;
+    std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
+    std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
+    std::vector<Qnn_Param_t> _param_types;
+    std::vector<std::string> _param_names;
+
+    DISABLE_COPY(ggml_qnn_op_config);
+    DISABLE_MOVE(ggml_qnn_op_config);
+};
+} // namespace qnn
diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp
index 07fbfde78..b3181ed23 100644
--- a/ggml/src/ggml-qnn/tensor.hpp
+++ b/ggml/src/ggml-qnn/tensor.hpp
@@ -8,8 +8,6 @@
 
 #include "ggml-qnn.h"
 
-#include "QnnTensor.h"
-#include "System/QnnSystemInterface.h"
 #include "buffer.hpp"
 #include "logger.hpp"
 #include "qnn-lib.hpp"
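
Note on the shape of this refactor: `build_graph` previously took a QNN op-type string and assembled the `Qnn_OpConfig_t` inline, which left no room for per-op scalar parameters. The new `ggml_op_constructor_t` factory defers op construction until the graph knows its own name, so a special case like `QNN_OP_MAT_MUL` can bake `QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0` into the config before the node is added. Below is a minimal, SDK-free sketch of the same pattern; `op_config`, `make_constructor`, and the parameter names are hypothetical stand-ins invented for illustration, not the real types from op-config.hpp or backend-ops.cpp.

```cpp
// Standalone sketch of the op-constructor factory pattern from this diff.
// Compiles with only the standard library; no QNN SDK types are used.
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct op_config {
    std::string name;     // node name inside the graph (the graph key in the diff)
    std::string op_type;  // e.g. "MatMul", standing in for QNN_OP_MAT_MUL
    std::vector<std::pair<std::string, bool>> scalar_params;

    void add_scalar_param(const std::string &param, bool value) {
        scalar_params.emplace_back(param, value);
    }
};

// Mirrors qnn::ggml_op_constructor_t: the graph passes its own name in and
// receives a fully configured op description back.
using op_constructor = std::function<std::unique_ptr<op_config>(const std::string &)>;

op_constructor make_constructor(const std::string &op_type) {
    if (op_type == "MatMul") {
        // Special case, like QNN_OP_MAT_MUL above: bake a transpose flag
        // into the op before the graph node is created.
        return [](const std::string &name) {
            auto config = std::make_unique<op_config>();
            config->name = name;
            config->op_type = "MatMul";
            config->add_scalar_param("transpose_in0", true);
            return config;
        };
    }

    // Common case: capture the op type now, fill in the name at build time.
    return [op_type](const std::string &name) {
        auto config = std::make_unique<op_config>();
        config->name = name;
        config->op_type = op_type;
        return config;
    };
}

int main() {
    auto ctor = make_constructor("MatMul");
    auto config = ctor("mul_mat_graph_0");  // graph name becomes the node name
    std::cout << config->op_type << " node '" << config->name << "' with "
              << config->scalar_params.size() << " scalar param(s)\n";
    return 0;
}
```

The same shape appears in `get_qnn_graph_from_cache`: the constructor is looked up once per op via `kGgmlOpToQnnOp`, and the graph key (op name plus tensor dimensions) decides whether a previously built graph can be reused instead of rebuilding the node.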