From 7a44a95b08fa8fb23cbafcf2c4f4573b1e6874c6 Mon Sep 17 00:00:00 2001 From: jianyuzh Date: Wed, 24 Jan 2024 14:39:46 +0800 Subject: [PATCH] update CI/action for sycl code, fix CI error of repeat/dup --- .github/workflows/build.yml | 45 +++++++++++++++++++++ ci/README.md | 4 ++ ci/run.sh | 11 +++++ ggml-sycl.cpp | 80 ++++++++++++++++++++++++++++++++++--- 4 files changed, 134 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c3aa6f992..820878a8f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -143,6 +143,51 @@ jobs: cd build ctest --verbose + ubuntu-22.04-cmake-sycl: + runs-on: ubuntu-22.04 + + continue-on-error: true + + strategy: + matrix: + mpi_library: [mpich, libopenmpi-dev] + + steps: + - uses: actions/checkout@v2 + + - name: add oneAPI to apt + shell: bash + run: | + cd /tmp + wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" + + - name: install oneAPI dpcpp compiler + shell: bash + run: | + sudo apt update + sudo apt install intel-oneapi-compiler-dpcpp-cpp + + - name: install oneAPI MKL library + shell: bash + run: | + sudo apt install intel-oneapi-mkl-devel + + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + mkdir build + cd build + cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. + cmake --build . --config Release -j $(nproc) + # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 diff --git a/ci/README.md b/ci/README.md index 65cfe63eb..406470519 100644 --- a/ci/README.md +++ b/ci/README.md @@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt # with CUDA support GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt + +# with SYCL support +source /opt/intel/oneapi/setvars.sh +GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt ``` diff --git a/ci/run.sh b/ci/run.sh index 791b17a19..755461e68 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -10,6 +10,9 @@ # # with CUDA support # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with SYCL support +# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# if [ -z "$2" ]; then echo "usage: $0 " @@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1" fi +if [ ! -z ${GG_BUILD_SYCL} ]; then + if [ -z ${ONEAPI_ROOT} ]; then + echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh" + exit 1 + fi + + CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" +fi ## helpers # download a file if it does not exist or if it is outdated diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 44ba95200..dba62bd22 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3269,10 +3269,6 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo // printf("local buf %p size %d bytes\n", local_buf, total_size); ggml_sycl_set_device(g_main_device); dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; - - // printf("zjy before memcpy local_buf=%p, src->data=%p\n", local_buf, src->data); - printf("zjy log dst_ddf=%p main_stream=%p g_main_device_index=%d\n", src, - main_stream, g_main_device_index); main_stream->memcpy(local_buf, src, total_size); } else { @@ -7657,6 +7653,20 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) { *dsti = *xi; } +static void cpy_1_i16_i16(const char * cxi, char * cdsti) { + const int16_t *xi = (const int16_t *)cxi; + int16_t *dsti = (int16_t *)cdsti; + + *dsti = *xi; +} + +static void cpy_1_i32_i32(const char * cxi, char * cdsti) { + const int32_t *xi = (const int32_t *)cxi; + int32_t *dsti = (int32_t *)cdsti; + + *dsti = *xi; +} + template static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, @@ -10678,6 +10688,56 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, } } +static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + // dpct::has_capability_or_fail(stream->get_device(), + // {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, + nb02, ne10, ne11, nb10, nb11, nb12, + item_ct1); + }); + } +} + +static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + // dpct::has_capability_or_fail(stream->get_device(), + // {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, + nb02, ne10, ne11, nb10, nb11, nb12, + item_ct1); + }); + } +} + static void scale_f32_sycl(const float *x, float *dst, const float scale, const int k, dpct::queue_ptr stream) { const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE; @@ -11550,8 +11610,6 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0, float *dst_dd, const dpct::queue_ptr &main_stream) { - GGML_ASSERT(src1->type == GGML_TYPE_F32); - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { @@ -11560,6 +11618,12 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0, } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { + op()(src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd, + main_stream); + } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) { + op()(src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd, + main_stream); } else { fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); @@ -13845,6 +13909,10 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { + ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { + ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type));