From 65f895d41b4f5201acca1cf2cabb11b2aae8d58e Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Thu, 4 Jan 2024 23:09:56 +0800
Subject: [PATCH] support main device is non-zero

---
 CMakeLists.txt |  1 +
 ggml-sycl.cpp  | 34 ++++++++++++++++++++++++++++++----
 ggml-sycl.hpp  |  4 +++-
 llama.cpp      | 12 ++++++++++++
 4 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8488689d9..8054b2a4f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -474,6 +474,7 @@ if (LLAMA_SYCL)
 
     if (_sycl_support)
 	    add_compile_definitions(GGML_USE_CUBLAS)
+	    add_compile_definitions(GGML_USE_SYCL)
 	    #add_compile_definitions(GGML_SYCL_F16)
 	    #add_compile_options(-std=c++17 -O3 -fsycl)
 	add_compile_options(-I./)
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index dab9726a7..8353e544b 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -387,8 +387,8 @@ catch (sycl::exception const &exc) {
 
 static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
-static int g_main_device = 0;
-static int g_main_device_index = 0;
+static int g_main_device = -1;
+static int g_main_device_index = -1;
 
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
@@ -413,6 +413,10 @@ static size_t g_scratch_offset = 0;
 
 static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+int get_main_device(){
+    return g_main_device;
+}
+
 [[noreturn]]
 static void bad_arch(const sycl::stream &stream_ct1) {
     stream_ct1 << "ERROR: ggml-cuda was compiled without support for the "
@@ -6388,7 +6392,10 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
 }
 
 int get_device_index_by_id(int id){
-    return g_sycl_device_id2index[id].index;
+    int res = g_sycl_device_id2index[id].index;
+    GGML_SYCL_DEBUG("zjy get_device_index_by_id id=%d device_index=%d\n", id, res);
+    GGML_ASSERT(res>=0);
+    return res;
 }
 
 int get_current_device_index(){
@@ -8057,6 +8064,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_index()));
+    GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg index %d\n", id);
 #ifdef DEBUG_CUDA_MALLOC
     int nnz = 0;
     size_t max_size = 0;
@@ -8080,6 +8088,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
                         *actual_size = b.size;
                         b.ptr = nullptr;
                         b.size = 0;
+                        GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg return 1 %p\n", ptr);
                         return ptr;
                     }
                 }
@@ -8092,6 +8101,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
         *actual_size = b.size;
         b.ptr = nullptr;
         b.size = 0;
+        GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg return 2 %p\n", ptr);
         return ptr;
     }
     void * ptr;
@@ -8327,10 +8337,19 @@ void ggml_init_cublas() try {
         fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
 
+        for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
+            g_sycl_device_id2index[id].index = -1;
+            g_device_caps[id].vmm = 0;
+            g_device_caps[id].device_id = -1;
+            g_device_caps[id].cc = 0;
+            g_tensor_split[id] = 0;
+        }
+
 
         int device_inx = -1;
         for (int id = 0; id < g_all_sycl_device_count; ++id) {
             if(id!=user_device_number) continue;
+
             device_inx++;
             int device_vmm = 0;
 
@@ -8400,8 +8419,15 @@ void ggml_init_cublas() try {
 
         //zjy hardcode, force set to 1 device
         g_device_count = 1;
+        ggml_cuda_set_main_device(user_device_number);
         ggml_cuda_set_device(user_device_number);
         fprintf(stderr, "Using Device %d\n", user_device_number);
+
+        // for (int id = 0; id < g_all_sycl_device_count; ++id) {
+        //     GGML_SYCL_DEBUG("zjy id=%d  g_device_caps[%d].device_id=%d g_sycl_device_id2index[%d].index=%d ", id, id,
+        //     g_device_caps[id].device_id, id, g_sycl_device_id2index[id].index);
+        // }
+
         initialized = true;
         g_cublas_loaded = true;
     }
@@ -11220,7 +11246,7 @@ void ggml_cuda_set_main_device(const int main_device) try {
         return;
     }
 
-    if (g_main_device != main_device && g_device_count > 1) {
+    if (g_main_device != main_device && g_device_count >= 1) {
         g_main_device = main_device;
         g_main_device_index = get_device_index_by_id(g_main_device);
         dpct::device_info prop;
diff --git a/ggml-sycl.hpp b/ggml-sycl.hpp
index 252184a46..2c56cf47d 100644
--- a/ggml-sycl.hpp
+++ b/ggml-sycl.hpp
@@ -14,4 +14,6 @@
     }                                                                          \
   }()
 
-#define DEBUG_CUDA_MALLOC
\ No newline at end of file
+#define DEBUG_CUDA_MALLOC
+
+int get_main_device();
\ No newline at end of file
diff --git a/llama.cpp b/llama.cpp
index f6f1ec0f4..276647be3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13,6 +13,10 @@
 #  include "ggml-opencl.h"
 #endif
 
+#ifdef GGML_USE_SYCL
+#  include "ggml-sycl.hpp"
+#endif
+
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
@@ -9785,6 +9789,14 @@ struct llama_model * llama_load_model_from_file(
               struct llama_model_params   params) {
     ggml_time_init();
 
+#ifdef GGML_USE_SYCL
+    int main_device = get_main_device();
+    if(main_device>=0) params.main_gpu = main_device;
+    else {
+        LLAMA_LOG_ERROR("%s: missed to init GPU device\n", __func__);
+        std::exit(1);
+    }
+#endif
     llama_model * model = new llama_model;
 
     unsigned cur_percentage = 0;