ggml: add Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) backend
This commit is contained in:
parent
3fec68be4e
commit
b0c3013f2e
4 changed files with 4960 additions and 2 deletions
ggml-qnn.cpp (new file, 4874 lines)
File diff suppressed because it is too large
ggml-qnn.h (new file, 55 lines)
@@ -0,0 +1,55 @@
/*
 * MIT license
 * Copyright (C) 2024 GGML Authors
 * SPDX-License-Identifier: MIT
 *
 * this is the implementation of the ggml QNN (Qualcomm Neural Network, aka AI Engine Direct) backend
 */
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_QNN_NAME "QNN"
#define GGML_QNN_MAX_DEVICES 3

// the QNN cDSP and HTA backends are not used for now; focus on the QNN CPU/GPU/HTP (aka DSP) backends
enum QNNBackend {
    QNN_CPU,
    QNN_GPU,
    QNN_HTP,
};

GGML_API int ggml_backend_qnn_reg_devices();

/**
 *
 * @param dev_num      0: QNN_CPU  1: QNN_GPU  2: QNN_HTP (aka DSP)
 * @param qnn_lib_path QNN library path, such as "/data/data/com.ggml.llamacpp/" on Android,
 *                     which can be obtained via JNI from the Java layer
 * @return             ggml backend instance, or NULL on failure
 */
GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path);

GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend);

GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads);

GGML_API int ggml_backend_qnn_get_device_count(void);
GGML_API void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size);

GGML_API ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num);

// temporary API, should be removed in the future
GGML_API bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

#ifdef __cplusplus
}
#endif
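For orientation, the sketch below shows how a client might exercise this public API: enumerate the QNN devices, initialize one, and hand the backend to the usual ggml-backend machinery. It is only a sketch, not part of this commit; the device choice, buffer size, and library path are placeholder assumptions.

// hypothetical standalone usage of the ggml-qnn.h API above (illustration only)
#include <cstdio>

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-qnn.h"

int main(void) {
    // enumerate the available QNN devices (CPU / GPU / HTP)
    int n_devices = ggml_backend_qnn_get_device_count();
    for (int i = 0; i < n_devices; i++) {
        char desc[128];
        ggml_backend_qnn_get_device_description(i, desc, sizeof(desc));
        printf("QNN device %d: %s\n", i, desc);
    }

    // the library path is assumed to be the app's data directory, obtained via JNI on Android
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_HTP, "/data/data/com.ggml.llamacpp/");
    if (backend == nullptr || !ggml_backend_is_qnn(backend)) {
        fprintf(stderr, "failed to initialize QNN backend\n");
        return 1;
    }

    // ... build a ggml cgraph and run it with ggml_backend_graph_compute(backend, graph) ...

    ggml_backend_free(backend);
    return 0;
}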
ggml.c (3 lines changed)
@@ -16153,7 +16153,8 @@ static void ggml_compute_forward_cross_entropy_loss_back(

 /////////////////////////////////

-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+//workaround for Qualcomm QNN backend
+void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);

     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
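The only functional change here is dropping static so that ggml_compute_forward becomes visible outside ggml.c. Presumably this lets the QNN backend fall back to the stock CPU op implementations for operators it does not offload, along the lines of the hypothetical dispatch below; the helper and the fallback policy are assumptions for illustration, not code taken from ggml-qnn.cpp.

// hypothetical per-node dispatch inside the QNN backend (illustration only)
#include "ggml.h"
#include "ggml-qnn.h"

// now exported from ggml.c by this commit
extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

static void qnn_compute_node(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    // try the QNN path first; ggml_qnn_compute_forward (declared in ggml-qnn.h)
    // is assumed to return false for ops it cannot handle
    if (!ggml_qnn_compute_forward(params, tensor)) {
        // fall back to the reference CPU implementation
        ggml_compute_forward(params, tensor);
    }
}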
llama.cpp (30 lines changed)
@@ -17,6 +17,8 @@
 # include "ggml-sycl.h"
 #elif defined(GGML_USE_KOMPUTE)
 # include "ggml-kompute.h"
+#elif defined(GGML_USE_QNN)
+# include "ggml-qnn.h"
 #endif

 #ifdef GGML_USE_METAL
@@ -1680,6 +1682,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     if (buft == nullptr) {
         LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
     }
+#elif defined(GGML_USE_QNN)
+    buft = ggml_backend_qnn_buffer_type(gpu);
 #endif

     if (buft == nullptr) {
@@ -1720,6 +1724,8 @@ static size_t llama_get_device_count() {
     return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
+#elif defined(GGML_USE_QNN)
+    return ggml_backend_qnn_get_device_count();
 #else
     return 1;
 #endif
@@ -15090,6 +15096,8 @@ size_t llama_max_devices(void) {
     return GGML_SYCL_MAX_DEVICES;
 #elif defined(GGML_USE_VULKAN)
     return GGML_VK_MAX_DEVICES;
+#elif defined(GGML_USE_QNN)
+    return GGML_QNN_MAX_DEVICES;
 #else
     return 1;
 #endif
@@ -15105,7 +15113,7 @@ bool llama_supports_mlock(void) {

 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_QNN)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -15392,6 +15400,17 @@ struct llama_context * llama_new_context_with_model(
         }
         ctx->backends.push_back(backend);
     }
+#elif defined(GGML_USE_QNN)
+    if (model->n_gpu_layers > 0) {
+        // the second param is the package name of the Android app, which can be obtained via JNI from the Java layer
+        ggml_backend_t backend = ggml_backend_qnn_init(QNN_CPU, "/data/data/com.ggml.llamacpp/");
+        if (nullptr == backend) {
+            LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
 #endif
     ctx->backend_cpu = ggml_backend_cpu_init();
     if (ctx->backend_cpu == nullptr) {
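The hunk above hardcodes QNN_CPU and a fixed library path. An application embedding this backend might instead probe the devices in order of preference, roughly as sketched below; the helper name and the assumption that ggml_backend_qnn_init returns NULL for an unavailable device are not part of this commit.

// hypothetical helper: pick the first QNN device that initializes successfully
#include "ggml-qnn.h"

static ggml_backend_t llama_init_qnn_backend(const char * qnn_lib_path) {
    // prefer the HTP (DSP), then the GPU, then the QNN CPU reference device
    const enum QNNBackend candidates[] = { QNN_HTP, QNN_GPU, QNN_CPU };
    for (size_t i = 0; i < sizeof(candidates)/sizeof(candidates[0]); i++) {
        ggml_backend_t backend = ggml_backend_qnn_init(candidates[i], qnn_lib_path);
        if (backend != nullptr) {
            return backend;
        }
    }
    return nullptr;
}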
@@ -17558,6 +17577,14 @@ void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }

+static int llama_has_qnn(void) {
+#ifdef GGML_USE_QNN
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 const char * llama_print_system_info(void) {
     static std::string s;

@@ -17579,6 +17606,7 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+    s += "QNN = " + std::to_string(llama_has_qnn()) + " | ";

     return s.c_str();
 }