ggml: add Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) backend

zhou.weiguo 2024-04-24 16:28:18 +08:00
parent 3fec68be4e
commit b0c3013f2e
No known key found for this signature in database
GPG key ID: 952EA81D18BB2FA8
4 changed files with 4960 additions and 2 deletions

4874
ggml-qnn.cpp Normal file

File diff suppressed because it is too large

55
ggml-qnn.h Normal file
View file

@@ -0,0 +1,55 @@
/*
 * MIT license
 * Copyright (C) 2024 GGML Authors
 * SPDX-License-Identifier: MIT
 *
 * this is the implementation of the ggml QNN (Qualcomm Neural Network, aka AI Engine Direct) backend
 */
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_QNN_NAME "QNN"
#define GGML_QNN_MAX_DEVICES 3

// the QNN cDSP and HTA backends are not used for now; the focus is on the QNN CPU/GPU/HTP (aka DSP) backends
enum QNNBackend {
    QNN_CPU,
    QNN_GPU,
    QNN_HTP,
};

GGML_API int ggml_backend_qnn_reg_devices();

/**
 *
 * @param dev_num      0: QNN_CPU, 1: QNN_GPU, 2: QNN_HTP (aka DSP)
 * @param qnn_lib_path QNN library path, e.g. "/data/data/com.ggml.llamacpp/" on Android, which can be obtained via JNI from the Java layer
 * @return
 */
GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path);

GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend);

GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads);

GGML_API int ggml_backend_qnn_get_device_count(void);

GGML_API void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size);

GGML_API ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num);

// temporary API, should be removed in the future
GGML_API bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

#ifdef __cplusplus
}
#endif
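For orientation, here is a minimal sketch of how an application might drive this API. The library path and thread count are illustrative only, and ggml_backend_free() comes from ggml-backend.h rather than this header:

#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-qnn.h"

int main(void) {
    // illustrative path; on Android this is typically the app's own data directory
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_CPU, "/data/data/com.ggml.llamacpp/");
    if (backend == NULL) {
        fprintf(stderr, "failed to initialize QNN backend\n");
        return 1;
    }

    char desc[128];
    ggml_backend_qnn_get_device_description(QNN_CPU, desc, sizeof(desc));
    printf("QNN devices: %d, using: %s\n", ggml_backend_qnn_get_device_count(), desc);

    ggml_backend_qnn_set_n_threads(backend, 4); // thread count is arbitrary here
    // ... build a ggml graph and compute it on this backend ...
    ggml_backend_free(backend);
    return 0;
}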

3
ggml.c
View file

@@ -16153,7 +16153,8 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 
 /////////////////////////////////
 
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+//workaround for Qualcomm QNN backend
+void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
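Dropping the static qualifier is what lets the QNN backend reuse the default CPU path for operators it cannot (or does not yet) offload. The actual ggml-qnn.cpp is suppressed in this diff, so the following is only a hedged sketch of that fallback pattern; qnn_can_handle_op() and the return-value semantics are assumptions, not code from this commit:

#include <stdbool.h>
#include "ggml.h"

struct ggml_compute_params; // internal ggml struct, opaque here

// exported from ggml.c by the change above
void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

// hypothetical capability check: offload only a couple of ops to QNN
static bool qnn_can_handle_op(const struct ggml_tensor * tensor) {
    return tensor->op == GGML_OP_MUL_MAT || tensor->op == GGML_OP_ADD;
}

bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    if (!qnn_can_handle_op(tensor)) {
        ggml_compute_forward(params, tensor); // fall back to the stock CPU implementation
        return false;                         // assumed meaning: op was not handled by QNN
    }
    // ... otherwise map the op onto a QNN graph and execute it on CPU/GPU/HTP ...
    return true;
}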

llama.cpp
View file

@@ -17,6 +17,8 @@
 # include "ggml-sycl.h"
 #elif defined(GGML_USE_KOMPUTE)
 # include "ggml-kompute.h"
+#elif defined(GGML_USE_QNN)
+# include "ggml-qnn.h"
 #endif
 
 #ifdef GGML_USE_METAL
@@ -1680,6 +1682,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     if (buft == nullptr) {
         LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
     }
+#elif defined(GGML_USE_QNN)
+    buft = ggml_backend_qnn_buffer_type(gpu);
 #endif
 
     if (buft == nullptr) {
@@ -1720,6 +1724,8 @@ static size_t llama_get_device_count() {
     return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
+#elif defined(GGML_USE_QNN)
+    return ggml_backend_qnn_get_device_count();
 #else
     return 1;
 #endif
@@ -15090,6 +15096,8 @@ size_t llama_max_devices(void) {
     return GGML_SYCL_MAX_DEVICES;
 #elif defined(GGML_USE_VULKAN)
     return GGML_VK_MAX_DEVICES;
+#elif defined(GGML_USE_QNN)
+    return GGML_QNN_MAX_DEVICES;
 #else
     return 1;
 #endif
@@ -15105,7 +15113,7 @@ bool llama_supports_mlock(void) {
 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_QNN)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -15392,6 +15400,17 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#elif defined(GGML_USE_QNN)
+    if (model->n_gpu_layers > 0) {
+        // the second param is the package name of the Android app, which can be obtained via JNI from the Java layer
+        ggml_backend_t backend = ggml_backend_qnn_init(QNN_CPU, "/data/data/com.ggml.llamacpp/");
+        if (nullptr == backend) {
+            LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
 #endif
     ctx->backend_cpu = ggml_backend_cpu_init();
     if (ctx->backend_cpu == nullptr) {
@@ -17558,6 +17577,14 @@ void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
+static int llama_has_qnn(void) {
+#ifdef GGML_USE_QNN
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 const char * llama_print_system_info(void) {
     static std::string s;
@@ -17579,6 +17606,7 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+    s += "QNN = " + std::to_string(llama_has_qnn()) + " | ";
 
     return s.c_str();
 }
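Taken together, the llama.cpp hooks above make the QNN path visible through the existing public API. A small sketch of how an application could confirm the build at runtime, using only existing llama.h calls (no QNN-specific functions needed):

#include <stdio.h>
#include "llama.h"

int main(void) {
    // with GGML_USE_QNN defined, the system info string now includes "QNN = 1 | "
    printf("%s\n", llama_print_system_info());
    printf("gpu offload supported: %d\n", (int) llama_supports_gpu_offload());
    printf("max devices: %zu\n", llama_max_devices());
    return 0;
}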