update CI/action for sycl code, fix CI error of repeat/dup

2024-01-24 14:39:46 +08:00 · 2024-01-24 14:39:46 +08:00 · 7a44a95b08
commit 7a44a95b08
parent 816f480e98
4 changed files with 134 additions and 6 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -143,6 +143,51 @@ jobs:
          cd build
          ctest --verbose

+  ubuntu-22.04-cmake-sycl:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        mpi_library: [mpich, libopenmpi-dev]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          mkdir build
+          cd build
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake --build . --config Release -j $(nproc)
+
  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
--- a/ci/README.md
+++ b/ci/README.md
@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt

 # with CUDA support
 GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with SYCL support
+source /opt/intel/oneapi/setvars.sh
+GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
--- a/ci/run.sh
+++ b/ci/run.sh
@ -10,6 +10,9 @@
 # # with CUDA support
 # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with SYCL support
+# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
 fi

+if [ ! -z ${GG_BUILD_SYCL} ]; then
+    if [ -z ${ONEAPI_ROOT} ]; then
+      echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
+      exit 1
+    fi
+
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx"
+fi
 ## helpers

 # download a file if it does not exist or if it is outdated
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -3269,10 +3269,6 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo
        // printf("local buf %p size %d bytes\n", local_buf, total_size);
        ggml_sycl_set_device(g_main_device);
        dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-
-        // printf("zjy before memcpy local_buf=%p, src->data=%p\n", local_buf, src->data);
-        printf("zjy log dst_ddf=%p main_stream=%p g_main_device_index=%d\n", src,
-            main_stream, g_main_device_index);
        main_stream->memcpy(local_buf, src, total_size);
    }
    else {
@ -7657,6 +7653,20 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
    *dsti = *xi;
 }

+static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
+    const int16_t *xi = (const int16_t *)cxi;
+    int16_t *dsti = (int16_t *)cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+    const int32_t *xi = (const int32_t *)cxi;
+    int32_t *dsti = (int32_t *)cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@ -10678,6 +10688,56 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
    }
 }

+static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
 static void scale_f32_sycl(const float *x, float *dst, const float scale,
                           const int k, dpct::queue_ptr stream) {
    const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
@ -11550,8 +11610,6 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
                                   float *dst_dd,
                                   const dpct::queue_ptr &main_stream) {

-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
@ -11560,6 +11618,12 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
             main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+        op()(src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
+             main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
+        op()(src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
+             main_stream);
    } else {
        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
@ -13845,6 +13909,10 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
        ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
+        ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
    } else {
        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));