From 7a44a95b08fa8fb23cbafcf2c4f4573b1e6874c6 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 14:39:46 +0800
Subject: [PATCH] update CI/action for sycl code, fix CI error of repeat/dup

---
 .github/workflows/build.yml | 45 +++++++++++++++++++++
 ci/README.md                |  4 ++
 ci/run.sh                   | 11 +++++
 ggml-sycl.cpp               | 80 ++++++++++++++++++++++++++++++++++---
 4 files changed, 134 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c3aa6f992..820878a8f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -143,6 +143,51 @@ jobs:
           cd build
           ctest --verbose
 
+  ubuntu-22.04-cmake-sycl:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        mpi_library: [mpich, libopenmpi-dev]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          mkdir build
+          cd build
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake --build . --config Release -j $(nproc)
+
   # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   #       how to debug it.
   #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
diff --git a/ci/README.md b/ci/README.md
index 65cfe63eb..406470519 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
 # with CUDA support
 GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with SYCL support
+source /opt/intel/oneapi/setvars.sh
+GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
diff --git a/ci/run.sh b/ci/run.sh
index 791b17a19..755461e68 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -10,6 +10,9 @@
 # # with CUDA support
 # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with SYCL support
+# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
 fi
 
+if [ ! -z ${GG_BUILD_SYCL} ]; then
+    if [ -z ${ONEAPI_ROOT} ]; then
+      echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
+      exit 1
+    fi
+
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx"
+fi
 ## helpers
 
 # download a file if it does not exist or if it is outdated
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 44ba95200..dba62bd22 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -3269,10 +3269,6 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo
         // printf("local buf %p size %d bytes\n", local_buf, total_size);
         ggml_sycl_set_device(g_main_device);
         dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-
-        // printf("zjy before memcpy local_buf=%p, src->data=%p\n", local_buf, src->data);
-        printf("zjy log dst_ddf=%p main_stream=%p g_main_device_index=%d\n", src,
-            main_stream, g_main_device_index);
         main_stream->memcpy(local_buf, src, total_size);
     }
     else {
@@ -7657,6 +7653,20 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
     *dsti = *xi;
 }
 
+static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
+    const int16_t *xi = (const int16_t *)cxi;
+    int16_t *dsti = (int16_t *)cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+    const int32_t *xi = (const int32_t *)cxi;
+    int32_t *dsti = (int32_t *)cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -10678,6 +10688,56 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
     }
 }
 
+static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
 static void scale_f32_sycl(const float *x, float *dst, const float scale,
                            const int k, dpct::queue_ptr stream) {
     const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
@@ -11550,8 +11610,6 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
                                    float *dst_dd,
                                    const dpct::queue_ptr &main_stream) {
 
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
         op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
@@ -11560,6 +11618,12 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
         op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
              main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+        op()(src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
+             main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
+        op()(src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
+             main_stream);
     } else {
         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
             ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -13845,6 +13909,10 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
         ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
+        ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));