ggml: add Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) backend
parent 3fec68be4e
commit b0c3013f2e

4 changed files with 4960 additions and 2 deletions
ggml-qnn.cpp (new file, 4874 additions)
File diff suppressed because it is too large.
ggml-qnn.h (new file, 55 additions)

@@ -0,0 +1,55 @@
+/*
+ * MIT license
+ * Copyright (C) 2024 GGML Authors
+ * SPDX-License-Identifier: MIT
+ *
+ * this is the implementation of the ggml QNN (Qualcomm Neural Network, aka AI Engine Direct) backend
+ */
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define GGML_QNN_NAME "QNN"
+#define GGML_QNN_MAX_DEVICES 3
+
+// the QNN cDSP and HTA backends are not used currently; the focus is on the QNN CPU/GPU/HTP (aka DSP) backends
+enum QNNBackend {
+    QNN_CPU,
+    QNN_GPU,
+    QNN_HTP,
+};
+
+GGML_API int ggml_backend_qnn_reg_devices();
+
+/**
+ *
+ * @param device        0: QNN_CPU, 1: QNN_GPU, 2: QNN_HTP (aka DSP)
+ * @param qnn_lib_path  QNN library path, such as "/data/data/com.ggml.llamacpp/" on Android, which can be obtained via JNI from the Java layer
+ * @return
+ */
+GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path);
+
+GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend);
+
+GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads);
+
+GGML_API int ggml_backend_qnn_get_device_count(void);
+GGML_API void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size);
+
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num);
+
+
+// temporary API, should be removed in the future
+GGML_API bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+
+#ifdef __cplusplus
+}
+#endif
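The header above is the backend's entire public surface. As a rough illustration of how a host application might drive it (not part of this commit; device index QNN_HTP and the package path are assumptions mirroring the header comment, and graph construction is omitted):

// minimal host-side sketch, assuming only the declarations from ggml-qnn.h and ggml-backend.h above
#include <cstdio>
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-qnn.h"

int main() {
    // enumerate the devices exposed by the backend (CPU / GPU / HTP)
    const int n_dev = ggml_backend_qnn_get_device_count();
    for (int i = 0; i < n_dev; i++) {
        char desc[256];
        ggml_backend_qnn_get_device_description(i, desc, sizeof(desc));
        std::printf("QNN device %d: %s\n", i, desc);
    }

    // the second argument is the app's data directory on Android
    // (hypothetical package name, matching the example in the header comment)
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_HTP, "/data/data/com.ggml.llamacpp/");
    if (backend == nullptr || !ggml_backend_is_qnn(backend)) {
        std::fprintf(stderr, "failed to initialize QNN backend\n");
        return 1;
    }

    // tensors destined for this backend would be allocated from its buffer type
    ggml_backend_buffer_type_t buft = ggml_backend_qnn_buffer_type(QNN_HTP);
    (void) buft; // graph building and compute are omitted in this sketch

    ggml_backend_free(backend);
    return 0;
}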
ggml.c (3 changes)

@@ -16153,7 +16153,8 @@ static void ggml_compute_forward_cross_entropy_loss_back(

 /////////////////////////////////

-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+//workaround for Qualcomm QNN backend
+void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);

     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
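The only change to ggml.c is dropping the `static` qualifier so that ggml_compute_forward, the CPU operator dispatcher, is visible outside its translation unit. A plausible reading (ggml-qnn.cpp is suppressed above, so this is an assumption rather than the commit's actual code) is that the QNN backend falls back to it for operators it cannot offload, roughly like:

// hypothetical fallback path inside the QNN backend (sketch only)
struct ggml_compute_params;   // internal to ggml.c, used only through a pointer here
struct ggml_tensor;           // defined in ggml.h

extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

static bool qnn_compute_or_fallback(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    // ... try to lower the op into a QNN graph here; if that is not possible,
    // reuse the CPU implementation that this commit exports:
    ggml_compute_forward(params, tensor);
    return true;
}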
llama.cpp (30 changes)

@@ -17,6 +17,8 @@
 # include "ggml-sycl.h"
 #elif defined(GGML_USE_KOMPUTE)
 # include "ggml-kompute.h"
+#elif defined(GGML_USE_QNN)
+# include "ggml-qnn.h"
 #endif

 #ifdef GGML_USE_METAL

@@ -1680,6 +1682,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     if (buft == nullptr) {
         LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
     }
+#elif defined(GGML_USE_QNN)
+    buft = ggml_backend_qnn_buffer_type(gpu);
 #endif

     if (buft == nullptr) {

@@ -1720,6 +1724,8 @@ static size_t llama_get_device_count() {
     return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
+#elif defined(GGML_USE_QNN)
+    return ggml_backend_qnn_get_device_count();
 #else
     return 1;
 #endif

@@ -15090,6 +15096,8 @@ size_t llama_max_devices(void) {
     return GGML_SYCL_MAX_DEVICES;
 #elif defined(GGML_USE_VULKAN)
     return GGML_VK_MAX_DEVICES;
+#elif defined(GGML_USE_QNN)
+    return GGML_QNN_MAX_DEVICES;
 #else
     return 1;
 #endif

@@ -15105,7 +15113,7 @@ bool llama_supports_mlock(void) {

 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_QNN)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -15392,6 +15400,17 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#elif defined(GGML_USE_QNN)
+    if (model->n_gpu_layers > 0) {
+        // the second param is the data directory of the Android app (derived from its package name), which can be obtained via JNI from the Java layer
+        ggml_backend_t backend = ggml_backend_qnn_init(QNN_CPU, "/data/data/com.ggml.llamacpp/");
+        if (nullptr == backend) {
+            LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
 #endif
     ctx->backend_cpu = ggml_backend_cpu_init();
     if (ctx->backend_cpu == nullptr) {
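As the comment in the hunk above notes, the QNN path is currently hard-wired to QNN_CPU and to the data directory of one hypothetical app. One way the Java layer could hand that path down is a small JNI bridge; the class and method names below are assumptions for illustration only, not part of this commit:

// hypothetical JNI entry point: the Java side passes the app's data directory
// (obtained from the Android Context) down to the backend
#include <jni.h>
#include "ggml-backend.h"
#include "ggml-qnn.h"

extern "C" JNIEXPORT jlong JNICALL
Java_com_ggml_llamacpp_LlamaBridge_initQnnBackend(JNIEnv * env, jobject /*thiz*/, jstring data_dir) {
    const char * path = env->GetStringUTFChars(data_dir, nullptr);
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_CPU, path);
    env->ReleaseStringUTFChars(data_dir, path);
    return reinterpret_cast<jlong>(backend); // 0 on failure, to be checked on the Java side
}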
@@ -17558,6 +17577,14 @@ void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }

+static int llama_has_qnn(void) {
+#ifdef GGML_USE_QNN
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 const char * llama_print_system_info(void) {
     static std::string s;

@@ -17579,6 +17606,7 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+    s += "QNN = " + std::to_string(llama_has_qnn()) + " | ";

     return s.c_str();
 }