From b7d781ec81eb2bdeedabdf540fdbec37cfb02e90 Mon Sep 17 00:00:00 2001
From: hongruichen
Date: Wed, 17 Jul 2024 23:08:16 +0800
Subject: [PATCH] remove dedicated QNN unit tests, since `test-backend-ops` now
 cross-validates backend ops

---
 tests/ggml-qnn/CMakeLists.txt           |  68 ---
 tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 207 ---------
 tests/ggml-qnn/ggml-qnn-ut.cpp          | 544 ------------------------
 3 files changed, 819 deletions(-)
 delete mode 100644 tests/ggml-qnn/CMakeLists.txt
 delete mode 100755 tests/ggml-qnn/ggml-qnn-ut-build-run.sh
 delete mode 100644 tests/ggml-qnn/ggml-qnn-ut.cpp

diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt
deleted file mode 100644
index f9678d3d8..000000000
--- a/tests/ggml-qnn/CMakeLists.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-cmake_minimum_required(VERSION 3.22.1)
-project(ggml-qnn-test)
-
-set(CMAKE_VERBOSE_MAKEFILE on)
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-# set to OFF if the target Android phone is not equipped with a Qualcomm Snapdragon 8 Gen 3
-set(TARGET_SNAPDRAGON_8_GEN3 ON)
-
-set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN)
-set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android)
-
-include_directories(${QNN_INC_PATH})
-include_directories(../../ggml/include) # ggml.h, ggml-qnn.h
-
-set(SOURCE_FILES
-    ../../ggml/src/ggml.c
-    ../../ggml/src/ggml-alloc.c
-    ../../ggml/src/ggml-backend.c
-    ../../ggml/src/ggml-quants.c
-    ../../ggml/src/ggml-qnn/qnn-lib.cpp
-    ../../ggml/src/ggml-qnn/logger.cpp
-    ../../ggml/src/ggml-qnn/utils.cpp
-    ../../ggml/src/ggml-qnn/backend-ops.cpp
-    ../../ggml/src/ggml-qnn.cpp
-    ggml-qnn-ut.cpp
-)
-
-message("QNN_SDK_PATH : ${QNN_SDK_PATH}")
-message("QNN_INC_PATH : ${QNN_INC_PATH}")
-message("QNN_LIB_PATH : ${QNN_LIB_PATH}")
-
-add_definitions(-D__ARM_NEON)
-add_definitions(-DGGML_USE_QNN)
-
-if(CMAKE_BUILD_TYPE STREQUAL "Release")
-    add_definitions(-DNDEBUG)
-endif()
-add_definitions(-O3)
-
-if (TARGET_SNAPDRAGON_8_GEN3)
-    # the build flags below are only verified on the Qualcomm SM8650-AB (Snapdragon 8 Gen 3)
-    add_definitions(-march=armv8.7-a)
-    add_definitions(-mcpu=cortex-x1)
-    add_definitions(-mtune=cortex-x1)
-else()
-    # the build flag below should work on any Android phone with a mainstream Qualcomm mobile SoC
-    add_definitions(-mcpu=cortex-a72)
-endif()
-
-add_compile_options("-Wall" "-Wno-sign-compare")
-
-find_library(LOG_LIB log)
-
-link_libraries(${LOG_LIB} android)
-
-add_executable(${TARGET_NAME}
-    ${SOURCE_FILES}
-)
-
-target_include_directories(${TARGET_NAME} PRIVATE
-    ../../ggml/src/ggml-qnn/
-)
diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
deleted file mode 100755
index e12b987b8..000000000
--- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
+++ /dev/null
@@ -1,207 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct
-# https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
-# QNN SDK released on 20240531
-QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.23.0.240531/
-
-ANDROID_NDK=`pwd`/android-ndk-r26c
-ANDROID_PLATFORM=android-34
-
-GGML_QNN_UT=ggml-qnn-ut
-REMOTE_PATH=/data/local/tmp/
-#BUILDTYPE=Release
-BUILDTYPE=Debug
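-
-# NOTE: the paths above assume a Linux x86-64 host; adjust QNN_SDK_PATH (and the
-# NDK version below) to match the locally installed QNN/QAIRT SDK and Android NDK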
-
-
-function dump_vars()
-{
-    echo -e "ANDROID_NDK:  ${ANDROID_NDK}"
-    echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}"
-}
-
-
-function show_pwd()
-{
-    echo -e "current working path: $(pwd)\n"
-}
-
-
-function check_qnn_sdk()
-{
-    if [ ! -d ${QNN_SDK_PATH} ]; then
-        echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} does not exist, please check it or download the SDK from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n"
-        exit 1
-    fi
-}
-
-
-function check_and_download_ndk()
-{
-    is_android_ndk_exist=1
-
-    if [ ! -d ${ANDROID_NDK} ]; then
-        is_android_ndk_exist=0
-    fi
-
-    if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then
-        is_android_ndk_exist=0
-    fi
-
-    if [ ${is_android_ndk_exist} -eq 0 ]; then
-
-        if [ ! -f android-ndk-r26c-linux.zip ]; then
-            wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip
-        fi
-
-        # test the command directly: with `set -e` in effect, a bare failing
-        # command would abort the script before `$?` could ever be inspected
-        if ! unzip android-ndk-r26c-linux.zip; then
-            printf "failed to unpack the android ndk to %s \n" "${ANDROID_NDK}"
-            exit 1
-        fi
-
-        printf "android ndk saved to ${ANDROID_NDK} \n\n"
-    else
-        printf "android ndk already exists: ${ANDROID_NDK} \n\n"
-    fi
-}
-
-
-function build_arm64
-{
-    cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DCMAKE_BUILD_TYPE=${BUILDTYPE} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH}
-
-    cd ./out/arm64-v8a
-    make
-
-    ls -lah ${GGML_QNN_UT}
-    /bin/cp ${GGML_QNN_UT} ../../
-    cd -
-}
-
-
-function remove_temp_dir()
-{
-    if [ -d out ]; then
-        echo "remove out directory in `pwd`"
-        rm -rf out
-    fi
-}
-
-
-function update_qnn_libs()
-{
-    check_qnn_sdk
-
-    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so              ${REMOTE_PATH}/
-    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so                 ${REMOTE_PATH}/
-    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so                 ${REMOTE_PATH}/
-
-    # the QNN NPU (aka HTP) backend is only verified on Android phones equipped with a Qualcomm Snapdragon 8 Gen 3
-    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so                 ${REMOTE_PATH}/
-    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/
-    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so          ${REMOTE_PATH}/
-    adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so          ${REMOTE_PATH}/
-    adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so     ${REMOTE_PATH}/
-}
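-
-# NOTE: of the HTP libraries pushed above, libQnnHtpV75Stub.so is loaded on the
-# CPU side while libQnnHtpV75Skel.so runs on the Hexagon DSP; the stub/skel pair
-# should come from the same QNN SDK release, otherwise the NPU backend may fail
-# to load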
-
-
-function check_qnn_libs()
-{
-    # reuse the QNN libs cached on the Android phone; test the command directly
-    # because `set -e` would abort the script on a bare failing `adb shell ls`
-    if adb shell ls ${REMOTE_PATH}/libQnnCpu.so; then
-        printf "QNN libs already exist on the Android phone\n"
-    else
-        update_qnn_libs
-    fi
-}
-
-
-function build_ggml_qnn_ut()
-{
-    show_pwd
-    check_and_download_ndk
-    check_qnn_sdk
-    dump_vars
-    remove_temp_dir
-    build_arm64
-}
-
-
-function run_ggml_qnn_ut()
-{
-    check_qnn_libs
-
-    # upload the latest ggml-qnn-ut binary
-    adb push ${GGML_QNN_UT} ${REMOTE_PATH}
-    adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_UT}
-
-    case "$ggmlop" in
-        GGML_OP_ADD)
-            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend
-        ;;
-
-        GGML_OP_MUL_MAT)
-            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend
-        ;;
-
-        *)
-            printf " \n$ggmlop not supported currently\n"
-            show_usage
-            exit 1
-        ;;
-    esac
-}
-
-
-function show_usage()
-{
-    echo "Usage:"
-    echo "  $0 build            (build the Android command line UT program)"
-    echo "  $0 updateqnnlibs    (upload the latest QNN libs to the Android phone)"
-    echo "  $0 GGML_OP_ADD      0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)"
-    echo "  $0 GGML_OP_MUL_MAT  0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)"
-    echo -e "\n\n\n"
-}
-
-
-unset ggmlop
-unset qnnbackend
-
-check_qnn_sdk
-
-if [ $# == 0 ]; then
-    show_usage
-    exit 1
-elif [ $# == 1 ]; then
-    if [ "$1" == "-h" ]; then
-        # avoid uploading the command line program to the Android phone in this scenario
-        show_usage
-        exit 1
-    elif [ "$1" == "help" ]; then
-        # avoid uploading the command line program to the Android phone in this scenario
-        show_usage
-        exit 1
-    elif [ "$1" == "build" ]; then
-        build_ggml_qnn_ut
-        exit 0
-    elif [ "$1" == "updateqnnlibs" ]; then
-        update_qnn_libs
-        exit 0
-    else
-        ggmlop=$1
-        qnnbackend=0
-        run_ggml_qnn_ut
-    fi
-elif [ $# == 2 ]; then
-    ggmlop=$1
-    qnnbackend=$2
-    run_ggml_qnn_ut
-else
-    show_usage
-    exit 1
-fi
diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp
deleted file mode 100644
index 71cb86a71..000000000
--- a/tests/ggml-qnn/ggml-qnn-ut.cpp
+++ /dev/null
@@ -1,544 +0,0 @@
-#include <inttypes.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <iomanip>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "ggml.h"
-
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "ggml-qnn.h"
-
-#include "logger.hpp"
-
-static const char *get_qnn_backend_name(int n_backend_type) {
-    switch (n_backend_type) {
-        case QNN_BACKEND_CPU:
-            return "QNN-CPU";
-        case QNN_BACKEND_GPU:
-            return "QNN-GPU";
-        case QNN_BACKEND_NPU:
-            return "QNN-NPU";
-        case QNN_BACKEND_GGML:
-            return "ggml";
-        default:
-            return "unknown";
-    }
-}
-
-static bool ggml_graph_compute_helper(struct ggml_backend *backend, struct ggml_cgraph *graph,
-                                      std::vector<uint8_t> &buf, int n_threads, ggml_abort_callback abort_callback,
-                                      void *abort_callback_data) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    plan.abort_callback      = abort_callback;
-    plan.abort_callback_data = abort_callback_data;
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    if (backend && ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-
-#ifdef GGML_USE_QNN
-    if (backend && ggml_backend_is_qnn(backend)) {
-        ggml_backend_qnn_set_n_threads(backend, n_threads);
-    }
-#endif
-
-    if (nullptr != backend) {
-        return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS;
-    }
-
-    // plain-CPU fallback; ggml_graph_compute() also returns a ggml_status, which
-    // must be compared against GGML_STATUS_SUCCESS rather than converted to bool
-    return ggml_graph_compute(graph, &plan) == GGML_STATUS_SUCCESS;
-}
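-
-// Q8_0 packs each block of QK8_0 = 32 int8 quants together with one shared
-// fp16 scale d, so a quantized element is recovered as d * qs[k]; the struct
-// below mirrors ggml's block layout for the dump helper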
-#define QK8_0 32
-
-typedef struct {
-    uint16_t d;       // delta (shared fp16 scale)
-    int8_t qs[QK8_0]; // quants
-} block_q8_0;
-
-static inline float ggml_compute_fp16_to_fp32(uint16_t h) {
-    __fp16 tmp;
-    memcpy(&tmp, &h, sizeof(uint16_t));
-    return (float)tmp;
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-
-#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
-
-static void tensor_dump(const ggml_tensor *tensor, const char *name) {
-    // check before the log below dereferences the tensor
-    if (nullptr == tensor) {
-        QNN_LOG_WARN("tensor is null");
-        return;
-    }
-
-    QNN_LOG_INFO("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
-                 ", nb = (%5zi, %5zi, %5zi)\n",
-                 name, tensor->name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1],
-                 tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
-
-    float value = 0;
-    std::ostringstream tmposs;
-
-    if (tensor->type == GGML_TYPE_I8) {
-        for (int h = 0; h < tensor->ne[3]; h++) {
-            for (int i = 0; i < tensor->ne[2]; i++) {
-                for (int j = 0; j < tensor->ne[1]; j++) {
-                    for (int k = 0; k < tensor->ne[0]; k++) {
-                        // flat index ((h * ne2 + i) * ne1 + j) * ne0 + k, assuming a contiguous layout
-                        value = ((int8_t *)tensor->data)[((h * tensor->ne[2] + i) * tensor->ne[1] + j) * tensor->ne[0] + k];
-                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " ";
-                    }
-                    tmposs << "\n";
-                }
-            }
-        }
-        if (tmposs.str().size() <= (QNN_LOGBUF_LEN - 96)) {
-            QNN_LOG_INFO("\n%s\n", tmposs.str().c_str());
-            tmposs.clear();
-            tmposs.str("");
-        }
-    }
-
-    if (tensor->type == GGML_TYPE_F32) {
-        for (int h = 0; h < tensor->ne[3]; h++) {
-            for (int i = 0; i < tensor->ne[2]; i++) {
-                for (int j = 0; j < tensor->ne[1]; j++) {
-                    for (int k = 0; k < tensor->ne[0]; k++) {
-                        value = ((float *)tensor->data)[((h * tensor->ne[2] + i) * tensor->ne[1] + j) * tensor->ne[0] + k];
-                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " ";
-                    }
-                    tmposs << "\n";
-                }
-            }
-        }
-        if (tmposs.str().size() <= (QNN_LOGBUF_LEN - 96)) {
-            QNN_LOG_INFO("\n%s\n", tmposs.str().c_str());
-            tmposs.clear();
-            tmposs.str("");
-        }
-    }
-
-    if (tensor->type == GGML_TYPE_F16) {
-        for (int h = 0; h < tensor->ne[3]; h++) {
-            for (int i = 0; i < tensor->ne[2]; i++) {
-                for (int j = 0; j < tensor->ne[1]; j++) {
-                    for (int k = 0; k < tensor->ne[0]; k++) {
-                        unsigned short tmpvalue =
-                            ((unsigned short *)
-                                 tensor->data)[((h * tensor->ne[2] + i) * tensor->ne[1] + j) * tensor->ne[0] + k];
-                        value = GGML_FP16_TO_FP32(tmpvalue);
-                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " ";
-                    }
-                    tmposs << "\n";
-                }
-            }
-        }
-        if (tmposs.str().size() <= (QNN_LOGBUF_LEN - 96)) {
-            QNN_LOG_INFO("\n%s\n", tmposs.str().c_str());
-            tmposs.clear();
-            tmposs.str("");
-        }
-    }
-
-    if (tensor->type == GGML_TYPE_Q8_0) {
-        block_q8_0 *tmp = ((block_q8_0 *)tensor->data);
-        for (int j = 0; j < tensor->ne[1]; j++) {
-            int n = tensor->ne[0] / QK8_0; // blocks per row
-            for (int z = 0; z < n; z++) {
-                const float d = GGML_FP16_TO_FP32(tmp[j * n + z].d);
-                for (int k = 0; k < QK8_0; k++) {
-                    value = tmp[j * n + z].qs[k] * d;
-                    tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " ";
-                }
-            }
-            tmposs << "\n";
-        }
-        if (tmposs.str().size() <= (QNN_LOGBUF_LEN - 96)) {
-            QNN_LOG_INFO("\n%s\n", tmposs.str().c_str());
-            tmposs.clear();
-            tmposs.str("");
-        }
-    }
-}
-
-static uint32_t get_tensor_rank(const ggml_tensor *tensor) {
-    uint32_t rank = 0;
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) {
-            rank++;
-        }
-    }
-    return rank;
-}
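-
-// get_tensor_data_size() below simply defers to ggml_nbytes(); the manual
-// row-size computation is kept only as a debug-time cross-check of that value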
-static uint32_t get_tensor_data_size(const ggml_tensor *tensor) {
-#if ENABLE_QNNSDK_LOG
-    size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]);
-    size_t n_dims    = get_tensor_rank(tensor);
-    for (size_t i = 1; i < n_dims; i++) {
-        data_size *= tensor->ne[i];
-    }
-
-    QNN_LOG_DEBUG("get_tensor_data_size %zu", data_size);
-    QNN_LOG_DEBUG("ggml_nbytes(tensor) %zu", ggml_nbytes(tensor));
-#endif
-
-    return ggml_nbytes(tensor);
-}
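-
-// NOTE: unlike the upstream test-backend-ops helper it is modeled on, the
-// initializer below fills tensors with the deterministic sequence 1, 2, 3, ...
-// instead of uniform random values, so any mismatch reproduces bit-exactly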
-
-// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20
-static void init_tensor_uniform(ggml_tensor *tensor, float min = -1.0f, float max = 1.0f) {
-    size_t size = ggml_nelements(tensor);
-    std::vector<float> data(size);
-    for (size_t i = 0; i < size; i++) {
-        data[i] = i + 1;
-    }
-
-    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
-#ifdef GGML_USE_QNN
-        memcpy((char *)tensor->data, data.data(), size * sizeof(float));
-#else
-        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
-#endif
-    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
-        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
-        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
-        const float *im = imatrix.data();
-        if (!ggml_quantize_requires_imatrix(tensor->type)) {
-            // when the imatrix is optional, test quantization both with and without it,
-            // using one of the fill values to decide (with the deterministic fill above,
-            // data[0] is always 1.0f, so im ends up nullptr here)
-            if (data[0] > 0.5f * (min + max)) {
-                im = nullptr;
-            }
-        }
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size / tensor->ne[0], tensor->ne[0], im);
-        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
-#ifdef GGML_USE_QNN
-        memcpy((char *)tensor->data, dataq.data(), dataq.size());
-#else
-        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
-#endif
-    } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
-        // this reinterprets the float fill pattern as raw integer bytes, so the
-        // resulting integers are going to look weird
-#ifdef GGML_USE_QNN
-        memcpy((char *)tensor->data, data.data(), ggml_nbytes(tensor));
-#else
-        ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
-#endif
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-// ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310
-static void initialize_tensors(ggml_context *ctx) {
-    for (ggml_tensor *t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-        init_tensor_uniform(t);
-    }
-}
-
-static void show_usage() {
-    printf(
-        "\nUsage: ggml-qnn-ut [options]\n"
-        "\n"
-        "Options:\n"
-        " -t GGML_OP_ADD / GGML_OP_MUL_MAT\n"
-        " -b 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)\n"
-        " ?/h print usage information\n\n");
-}
-
-typedef ggml_tensor *(*ggml_op_unary_t)(ggml_context *ctx, ggml_tensor *a);
-
-typedef ggml_tensor *(*ggml_op_binary_t)(ggml_context *ctx, ggml_tensor *a, ggml_tensor *b);
-
-static constexpr const ggml_op_unary_t kUnaryOps[] = {
-    nullptr,   // GGML_OP_NONE
-    nullptr,   // GGML_OP_DUP
-    nullptr,   // GGML_OP_ADD
-    nullptr,   // GGML_OP_ADD1
-    nullptr,   // GGML_OP_ACC
-    nullptr,   // GGML_OP_SUB
-    nullptr,   // GGML_OP_MUL
-    nullptr,   // GGML_OP_DIV
-    nullptr,   // GGML_OP_SQR
-    ggml_sqrt, // GGML_OP_SQRT
-    ggml_log,  // GGML_OP_LOG
-    nullptr,   // GGML_OP_SUM
-    nullptr,   // GGML_OP_SUM_ROWS
-    nullptr,   // GGML_OP_MEAN
-    nullptr,   // GGML_OP_ARGMAX
-    nullptr,   // GGML_OP_REPEAT
-    nullptr,   // GGML_OP_REPEAT_BACK
-    nullptr,   // GGML_OP_CONCAT
-    nullptr,   // GGML_OP_SILU_BACK
-    nullptr,   // GGML_OP_NORM
-    nullptr,   // GGML_OP_RMS_NORM
-    nullptr,   // GGML_OP_RMS_NORM_BACK
-    nullptr,   // GGML_OP_GROUP_NORM
-    nullptr,   // GGML_OP_MUL_MAT
-};
-
-static constexpr const ggml_op_binary_t kBinaryOps[] = {
-    nullptr,      // GGML_OP_NONE
-    nullptr,      // GGML_OP_DUP
-    ggml_add,     // GGML_OP_ADD
-    nullptr,      // GGML_OP_ADD1
-    nullptr,      // GGML_OP_ACC
-    ggml_sub,     // GGML_OP_SUB
-    ggml_mul,     // GGML_OP_MUL
-    ggml_div,     // GGML_OP_DIV
-    nullptr,      // GGML_OP_SQR
-    nullptr,      // GGML_OP_SQRT
-    nullptr,      // GGML_OP_LOG
-    nullptr,      // GGML_OP_SUM
-    nullptr,      // GGML_OP_SUM_ROWS
-    nullptr,      // GGML_OP_MEAN
-    nullptr,      // GGML_OP_ARGMAX
-    nullptr,      // GGML_OP_REPEAT
-    nullptr,      // GGML_OP_REPEAT_BACK
-    nullptr,      // GGML_OP_CONCAT
-    nullptr,      // GGML_OP_SILU_BACK
-    nullptr,      // GGML_OP_NORM
-    nullptr,      // GGML_OP_RMS_NORM
-    nullptr,      // GGML_OP_RMS_NORM_BACK
-    nullptr,      // GGML_OP_GROUP_NORM
-    ggml_mul_mat, // GGML_OP_MUL_MAT
-};
-
-static_assert(kBinaryOps[GGML_OP_MUL_MAT] == ggml_mul_mat, "ggml_mul_mat at wrong index, check kBinaryOps");
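-
-// dispatch is positional: qnn_op_ut() indexes both tables with the raw ggml_op
-// enum value, so the entries must stay in lock-step with the enum order in
-// ggml.h; the static_assert above anchors the last covered entry
-// (GGML_OP_MUL_MAT) to catch drift when ops are inserted upstream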
-
-static void qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type, ggml_type qtype,
-                      std::vector<uint8_t> &results) {
-    int64_t n_begin_time = 0LL;
-    int64_t n_end_time   = 0LL;
-    int64_t n_duration   = 0LL;
-    size_t  ctx_size     = 0;
-    int     sizey        = 4;
-    int     sizex        = 4;
-
-    struct ggml_context *ctx     = nullptr;
-    struct ggml_cgraph *gf       = nullptr;
-    struct ggml_tensor *src0     = nullptr;
-    struct ggml_tensor *src1     = nullptr;
-    struct ggml_tensor *dst      = nullptr;
-    ggml_backend_t backend       = nullptr;
-    ggml_backend_buffer_t buffer = nullptr;
-
-    std::vector<uint8_t> work_buffer;
-    QNN_LOG_DEBUG("enter qnn_op_ut\n");
-    QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type));
-
-    n_begin_time = ggml_time_us();
-
-    ctx_size += 1024 * 1024 * 32;
-    QNN_LOG_DEBUG("allocating memory of size %zu bytes, %zu MB\n", ctx_size, (ctx_size / 1024 / 1024));
-
-    struct ggml_init_params params = { /*.mem_size   =*/ctx_size,
-                                       /*.mem_buffer =*/NULL,
-                                       /*.no_alloc   =*/0 };
-
-    if (n_backend_type != QNN_BACKEND_GGML) {
-        params.no_alloc = true;
-        backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/");
-        if (nullptr == backend) {
-            QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type));
-            return;
-        }
-    }
-
-    ctx = ggml_init(params);
-    if (!ctx) {
-        QNN_LOG_ERROR("%s: ggml_init() failed\n", __func__);
-        ggml_backend_free(backend);
-        return;
-    }
-
-    QNN_LOG_DEBUG("creating new tensors\n");
-    QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype));
-    QNN_LOG_DEBUG("ggml_type_size(%s) %zu\n", ggml_type_name(qtype), ggml_type_size(qtype));
-    if (ggml_is_quantized(qtype)) {
-        sizex = ggml_blck_size(qtype);
-
-        if (n_ggml_op_type == GGML_OP_MUL_MAT) {
-            sizex = ggml_blck_size(qtype) * 2;
-        }
-    }
-    QNN_LOG_DEBUG("sizex: %d\n", sizex);
-    QNN_LOG_DEBUG("sizey: %d\n", sizey);
-
-    src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-
-    ggml_set_input(src0);
-    ggml_set_input(src1);
-
-    auto unary_op  = kUnaryOps[n_ggml_op_type];
-    auto binary_op = kBinaryOps[n_ggml_op_type];
-    if (unary_op) {
-        dst = unary_op(ctx, src0);
-    } else if (binary_op) {
-        dst = binary_op(ctx, src0, src1);
-    } else {
-        QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type));
-        ggml_free(ctx);
-        ggml_backend_free(backend);
-        return;
-    }
-
-    ggml_set_output(dst);
-#ifdef GGML_USE_QNN
-    if (n_backend_type != QNN_BACKEND_GGML) {
-        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-        if (!buffer) {
-            QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__);
-            ggml_free(ctx);
-            ggml_backend_free(backend);
-            return;
-        }
-    }
-#endif
-
-    QNN_LOG_DEBUG("creating compute graph\n");
-    gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf, dst);
-
-    initialize_tensors(ctx);
-
-    ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr);
-
-    if (get_tensor_data_size(dst) < (32 * 32)) {
-        QNN_LOG_DEBUG("dump tensors:\n");
-        TENSOR_DUMP(src0);
-        TENSOR_DUMP(src1);
-        TENSOR_DUMP(dst);
-    } else {
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
-                      ", nb = (%5zi, %5zi, %5zi)\n",
-                      src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
-                      src0->nb[0], src0->nb[1], src0->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
-                      ", nb = (%5zi, %5zi, %5zi)\n",
-                      src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
-                      src1->nb[0], src1->nb[1], src1->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
-                      ", nb = (%5zi, %5zi, %5zi)\n",
-                      dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
-                      dst->nb[1], dst->nb[2]);
-    }
-
-    results.resize(ggml_nbytes(dst));
-    memcpy(results.data(), ggml_get_data(dst), ggml_nbytes(dst));
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-
-    n_end_time = ggml_time_us();
-    n_duration = (n_end_time - n_begin_time) / 1000;
-    QNN_LOG_DEBUG("duration of ut GGML_OP_%s using backend %s: %" PRIi64 " milliseconds\n",
-                  ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration);
-}
-
-#define DEFINE_OP(op) { #op, op }
-
-static const std::unordered_map<std::string, int> kMapStringToGGMLOp = {
-    DEFINE_OP(GGML_OP_ADD),  DEFINE_OP(GGML_OP_SUB),     DEFINE_OP(GGML_OP_MUL), DEFINE_OP(GGML_OP_DIV),
-    DEFINE_OP(GGML_OP_SQRT), DEFINE_OP(GGML_OP_MUL_MAT), DEFINE_OP(GGML_OP_LOG),
-};
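-
-// example on-device invocations (run_ggml_qnn_ut in the build script pushes the
-// binary to /data/local/tmp first):
-//   ./ggml-qnn-ut -t GGML_OP_ADD     -b 2    # ADD on the QNN NPU backend
-//   ./ggml-qnn-ut -t GGML_OP_MUL_MAT -b 0    # MUL_MAT on the QNN CPU backend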
-
-#define CONSOLE_RED   "\033[31m"
-#define CONSOLE_GREEN "\033[32m"
-#define CONSOLE_RESET "\033[0m"
-
-int main(int argc, char *argv[]) {
-    int num_threads    = 4;
-    int n_backend_type = QNN_BACKEND_CPU;
-    int n_ggml_op_type = GGML_OP_ADD;
-
-    for (int i = 1; i < argc; i++) {
-        if (0 == strcmp(argv[i], "-t")) {
-            if (i + 1 < argc) {
-                auto it = kMapStringToGGMLOp.find(argv[i + 1]);
-                if (it != kMapStringToGGMLOp.end()) {
-                    n_ggml_op_type = it->second;
-                } else {
-                    show_usage();
-                    return 1;
-                }
-                i++;
-            }
-        } else if (0 == strcmp(argv[i], "-b")) {
-            if (i + 1 < argc) {
-                int backend = atoi(argv[i + 1]);
-                // reject values outside the known backend range (including negatives)
-                if (backend >= QNN_BACKEND_CPU && backend <= QNN_BACKEND_GGML) {
-                    n_backend_type = backend;
-                } else {
-                    show_usage();
-                    return 1;
-                }
-                i++;
-            }
-        } else {
-            show_usage();
-            return 1;
-        }
-    }
-
-    QNN_LOG_DEBUG("enter ggml-qnn-ut\n");
-    QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type,
-                  ggml_op_name((enum ggml_op)n_ggml_op_type));
-
-    std::vector<uint8_t> results;
-    qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type, GGML_TYPE_F32, results);
-    std::vector<uint8_t> cpu_results;
-    qnn_op_ut(num_threads, QNN_BACKEND_GGML, n_ggml_op_type, GGML_TYPE_F32, cpu_results);
-
-    // TODO: in theory the results should be identical, but they may differ slightly
-    // across hardware; a better comparison would check the floating point values
-    // element-wise against an allowed error bound instead of byte-for-byte
-    if (results == cpu_results) {
-        QNN_LOG_INFO(CONSOLE_GREEN "[Success] results equal to CPU backend!" CONSOLE_RESET);
-        return 0;
-    } else {
-        QNN_LOG_ERROR(CONSOLE_RED "[Failed] results mismatch with CPU backend!" CONSOLE_RESET);
-        return 1;
-    }
-}
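
Note: the per-op cross-validation removed here is covered by the upstream `test-backend-ops` harness. Assuming a build with the QNN backend enabled, the equivalent checks would look roughly like the following (flag names as printed by the tool's own help; `ADD`/`MUL_MAT` are ggml op names):

    ./bin/test-backend-ops test -o ADD
    ./bin/test-backend-ops test -o MUL_MAT

`test` mode runs each op on every registered backend and compares the output element-wise, within a per-op tolerance, against the CPU reference; `perf` mode benchmarks the same ops instead.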