From d38d4a67d17570d3b3003397a50f873f5e143603 Mon Sep 17 00:00:00 2001
From: "zhou.weiguo" <zhouwg2000@gmail.com>
Date: Sun, 9 Jun 2024 23:49:54 +0800
Subject: [PATCH] npu: probe htp info and capacity of rpc ion memory

---
 ggml-qnn.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 115 insertions(+), 8 deletions(-)

diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp
index d1d69afe2..3248e244a 100644
--- a/ggml-qnn.cpp
+++ b/ggml-qnn.cpp
@@ -152,6 +152,28 @@ enum class ggml_qnn_profile_level {
     profile_detail = 2
 };
 
+enum qcom_htp_arch { // HTP architecture versions; each enumerator's value equals its version number
+    NONE = 0,
+    V68 = 68,
+    V69 = 69,
+    V73 = 73,
+    V75 = 75,
+};
+
+enum qcom_chipset { // SoC model ids; compared with chipinfo.socModel and used as g_qnn_soc_info_table indices
+    UNKNOWN_SM = 0,
+    SM8450 = 36,  // v69
+    SM8475 = 42,  // v69
+    SM8550 = 43,  // v73
+    SM8650 = 57,  // v75
+};
+
+struct qcom_socinfo { // one row of per-SoC HTP properties, see g_qnn_soc_info_table
+    int soc_model;        // a qcom_chipset value
+    int htp_arch;         // a qcom_htp_arch value
+    int vtcm_size_in_mb;  // VTCM size in MB, as the field name states
+};
+
 struct ggml_backend_qnn_context {
     int                           device;
     int                           threads;
@@ -216,6 +238,29 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
                          .raw_system_interface = {}},
 };
 
+static struct qcom_socinfo g_qnn_soc_info_table[] = { // sparse: indexed by qcom_chipset value, unlisted slots are zero-initialized
+    /* Qualcomm Snapdragon 8 Gen 1 */
+    [SM8450] = {.soc_model         = SM8450,
+                .htp_arch          = V69,
+                .vtcm_size_in_mb   = 8},
+
+    /* Qualcomm Snapdragon 8 Gen 1+ */
+    [SM8475] = {.soc_model         = SM8475,
+                .htp_arch          = V69,
+                .vtcm_size_in_mb   = 8},
+
+    /* Qualcomm Snapdragon 8 Gen 2 */
+    [SM8550] = {.soc_model         = SM8550,
+                .htp_arch          = V73,
+                .vtcm_size_in_mb   = 8},
+
+    /* Qualcomm Snapdragon 8 Gen 3 */
+    [SM8650] = {.soc_model         = SM8650,
+                .htp_arch          = V75,
+                .vtcm_size_in_mb   = 8},
+    // NOTE(review): array designators ([SM8450] = ...) are C99 syntax that C++ compilers accept only as an extension
+};
+
 // =================================================================================================
 //
 //  QNN helper functions and other internal helper functions
@@ -485,6 +530,8 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) {
         return QNN_DATATYPE_INT_8;
     case GGML_TYPE_Q8_0:
         return QNN_DATATYPE_SFIXED_POINT_8;
+    case GGML_TYPE_Q4_0:
+        return QNN_DATATYPE_SFIXED_POINT_4;
     default:
         break;
     }
@@ -527,19 +574,34 @@ Fn load_qnn_functionpointers(void * handle, const char * function_name) {
 
 static const char * get_qnn_backend_name(int n_backend_type) {
     switch (n_backend_type) {
-    case 0:
+    case QNN_BACKEND_CPU:
         return "QNN-CPU";
-    case 1:
+    case QNN_BACKEND_GPU:
         return "QNN-GPU";
-    case 2:
+    case QNN_BACKEND_NPU:
         return "QNN-NPU";
-    case 3:
+    case QNN_BACKEND_GGML:
         return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML
     default:
         return "unknown";
     }
 }
 
+static const char * qnn_get_chipset_desc(uint32_t chipset_id) {
+    // Translate a SoC model id into its part name for logging; ids without a match yield "unknown".
+    const char * desc = "unknown";
+    if (SM8450 == chipset_id) {
+        desc = "SM8450";
+    } else if (SM8475 == chipset_id) {
+        desc = "SM8475";
+    } else if (SM8550 == chipset_id) {
+        desc = "SM8550";
+    } else if (SM8650 == chipset_id) {
+        desc = "SM8650";
+    }
+    return desc;
+}
+
 static intptr_t align_to(size_t alignment, intptr_t offset) {
     return offset % alignment == 0
                ? offset
@@ -875,7 +937,7 @@ class qnn_instance {
         return 0;
     }
 
-    std::string &get_qnn_graph_name() { return _graph_name; }
+    std::string & get_qnn_graph_name() { return _graph_name; }
 
     bool is_rpcmem_initialized() { return _rpcmem_initialized; }
 
@@ -893,6 +955,8 @@ class qnn_instance {
 
     void free_rpcmem(void * buf);
 
+    size_t get_rpcmem_capacity() { return _rpcmem_capacity; } // value is in MB: 512 unless qnn_init() probed a larger size
+
     bool is_rpcmem_allocated(void * buf);
 
     bool is_rpcmem_registered(Qnn_MemHandle_t handle) {
@@ -977,6 +1041,7 @@ class qnn_instance {
     pfn_rpc_mem_init                   _pfn_rpc_mem_init;
     pfn_rpc_mem_deinit                 _pfn_rpc_mem_deinit;
     std::unordered_map<void *, void *> _rpcmem_store_map;
+    size_t                             _rpcmem_capacity = 512; // rpc ion memory capacity in MB; raised by the probe in qnn_init()
 
     std::string _graph_name;
 };
@@ -1493,6 +1558,46 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
         QNN_LOG_DEBUG("initialize qnn context successfully\n");
     }
 
+    if (_backend_name.find("Htp") != std::string::npos) {
+        // Log the HTP hardware QNN reports; p_info stays nullptr if the query did not fill it in.
+        const QnnDevice_PlatformInfo_t * p_info = nullptr;
+        _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info);
+        if (nullptr != p_info) {
+            QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices);
+            QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices;
+            for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) {
+                QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId,
+                             infos[i].v1.deviceType, infos[i].v1.numCores);
+                QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension;
+                QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice;
+                QnnHtpDevice_Arch_t chiparch = chipinfo.arch;
+                QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : "");
+                QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d, vtcm_size_in_mb:%d MB", chipinfo.socModel,
+                             qnn_get_chipset_desc(chipinfo.socModel), chiparch, chipinfo.vtcmSize);
+            }
+            _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info);
+        }
+
+        // TODO: faster approach to probe the accurate capacity of rpc ion memory
+        // Try ascending sizes (in MB) and remember the largest one that could be allocated.
+        size_t candidate_size = 0;
+        const size_t BYTES_PER_MB = (1 << 20);
+        const size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
+        for (const size_t slot_mb : probe_slots) {
+            uint8_t * rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(slot_mb * BYTES_PER_MB, 4));
+            if (nullptr == rpc_buffer) {
+                QNN_LOG_INFO("alloc rpcmem %zu (MB) failure, %s\n", slot_mb, strerror(errno));
+                break; // larger slots are not tried once one allocation fails
+            }
+            candidate_size = slot_mb;
+            free_rpcmem(rpc_buffer);
+        }
+        if (candidate_size > _rpcmem_capacity) {
+            _rpcmem_capacity = candidate_size; // only ever raise the default, never lower it
+        }
+        QNN_LOG_INFO("capacity of rpc ion memory %zu MB\n", _rpcmem_capacity);
+    }
+
     QNN_LOG_DEBUG("leave qni_init\n");
 
     return 0;
@@ -1654,9 +1759,11 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx,
     const int64_t ne20 = tensor->ne[0];
     const int64_t ne21 = tensor->ne[1];
 
-    //TODO: support other quatinized data type
-    if (ggml_is_quantized(src0->type) && (src0->type != GGML_TYPE_Q8_0)) {
-        return false;
+    //TODO: support other quantized data type
+    if (ggml_is_quantized(src0->type)) {
+        if ((src0->type != GGML_TYPE_Q8_0) && (src0->type != GGML_TYPE_Q4_0)) {
+            return false;
+        }
     }
 
     if (b_dump_tensor_info) {