update CI/action for sycl code, fix CI error of repeat/dup
This commit is contained in:
parent
816f480e98
commit
7a44a95b08
4 changed files with 134 additions and 6 deletions
45
.github/workflows/build.yml
vendored
45
.github/workflows/build.yml
vendored
|
@ -143,6 +143,51 @@ jobs:
|
||||||
cd build
|
cd build
|
||||||
ctest --verbose
|
ctest --verbose
|
||||||
|
|
||||||
|
ubuntu-22.04-cmake-sycl:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
|
continue-on-error: true
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
mpi_library: [mpich, libopenmpi-dev]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: add oneAPI to apt
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
cd /tmp
|
||||||
|
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||||
|
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||||
|
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||||
|
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||||
|
|
||||||
|
- name: install oneAPI dpcpp compiler
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install intel-oneapi-compiler-dpcpp-cpp
|
||||||
|
|
||||||
|
- name: install oneAPI MKL library
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
sudo apt install intel-oneapi-mkl-devel
|
||||||
|
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
||||||
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||||
# how to debug it.
|
# how to debug it.
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
||||||
|
|
|
@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
|
|
||||||
# with CUDA support
|
# with CUDA support
|
||||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
|
|
||||||
|
# with SYCL support
|
||||||
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
```
|
```
|
||||||
|
|
11
ci/run.sh
11
ci/run.sh
|
@ -10,6 +10,9 @@
|
||||||
# # with CUDA support
|
# # with CUDA support
|
||||||
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
#
|
#
|
||||||
|
# # with SYCL support
|
||||||
|
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
|
#
|
||||||
|
|
||||||
if [ -z "$2" ]; then
|
if [ -z "$2" ]; then
|
||||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||||
|
@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
|
if [ -z ${ONEAPI_ROOT} ]; then
|
||||||
|
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx"
|
||||||
|
fi
|
||||||
## helpers
|
## helpers
|
||||||
|
|
||||||
# download a file if it does not exist or if it is outdated
|
# download a file if it does not exist or if it is outdated
|
||||||
|
|
|
@ -3269,10 +3269,6 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo
|
||||||
// printf("local buf %p size %d bytes\n", local_buf, total_size);
|
// printf("local buf %p size %d bytes\n", local_buf, total_size);
|
||||||
ggml_sycl_set_device(g_main_device);
|
ggml_sycl_set_device(g_main_device);
|
||||||
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
|
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
|
||||||
|
|
||||||
// printf("zjy before memcpy local_buf=%p, src->data=%p\n", local_buf, src->data);
|
|
||||||
printf("zjy log dst_ddf=%p main_stream=%p g_main_device_index=%d\n", src,
|
|
||||||
main_stream, g_main_device_index);
|
|
||||||
main_stream->memcpy(local_buf, src, total_size);
|
main_stream->memcpy(local_buf, src, total_size);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -7657,6 +7653,20 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
||||||
*dsti = *xi;
|
*dsti = *xi;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
|
||||||
|
const int16_t *xi = (const int16_t *)cxi;
|
||||||
|
int16_t *dsti = (int16_t *)cdsti;
|
||||||
|
|
||||||
|
*dsti = *xi;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
||||||
|
const int32_t *xi = (const int32_t *)cxi;
|
||||||
|
int32_t *dsti = (int32_t *)cdsti;
|
||||||
|
|
||||||
|
*dsti = *xi;
|
||||||
|
}
|
||||||
|
|
||||||
template <cpy_kernel_t cpy_1>
|
template <cpy_kernel_t cpy_1>
|
||||||
static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
||||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||||
|
@ -10678,6 +10688,56 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
||||||
|
const int ne00, const int ne01,
|
||||||
|
const int nb00, const int nb01,
|
||||||
|
const int nb02, const int ne10,
|
||||||
|
const int ne11, const int nb10,
|
||||||
|
const int nb11, const int nb12,
|
||||||
|
dpct::queue_ptr stream) {
|
||||||
|
|
||||||
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||||
|
{
|
||||||
|
// dpct::has_capability_or_fail(stream->get_device(),
|
||||||
|
// {sycl::aspect::fp16});
|
||||||
|
|
||||||
|
stream->parallel_for(
|
||||||
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||||
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||||
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||||
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
|
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
||||||
|
nb02, ne10, ne11, nb10, nb11, nb12,
|
||||||
|
item_ct1);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
|
||||||
|
const int ne00, const int ne01,
|
||||||
|
const int nb00, const int nb01,
|
||||||
|
const int nb02, const int ne10,
|
||||||
|
const int ne11, const int nb10,
|
||||||
|
const int nb11, const int nb12,
|
||||||
|
dpct::queue_ptr stream) {
|
||||||
|
|
||||||
|
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||||
|
{
|
||||||
|
// dpct::has_capability_or_fail(stream->get_device(),
|
||||||
|
// {sycl::aspect::fp16});
|
||||||
|
|
||||||
|
stream->parallel_for(
|
||||||
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||||
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||||
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||||
|
[=](sycl::nd_item<3> item_ct1) {
|
||||||
|
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
||||||
|
nb02, ne10, ne11, nb10, nb11, nb12,
|
||||||
|
item_ct1);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void scale_f32_sycl(const float *x, float *dst, const float scale,
|
static void scale_f32_sycl(const float *x, float *dst, const float scale,
|
||||||
const int k, dpct::queue_ptr stream) {
|
const int k, dpct::queue_ptr stream) {
|
||||||
const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
|
const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
|
||||||
|
@ -11550,8 +11610,6 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
|
||||||
float *dst_dd,
|
float *dst_dd,
|
||||||
const dpct::queue_ptr &main_stream) {
|
const dpct::queue_ptr &main_stream) {
|
||||||
|
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||||
op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
||||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||||
|
@ -11560,6 +11618,12 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
|
||||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||||
op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
|
op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
|
||||||
main_stream);
|
main_stream);
|
||||||
|
} else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
|
||||||
|
op()(src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
|
||||||
|
main_stream);
|
||||||
|
} else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
|
||||||
|
op()(src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
|
||||||
|
main_stream);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
||||||
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||||
|
@ -13845,6 +13909,10 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||||
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||||
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||||
|
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
|
||||||
|
ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||||
|
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||||
|
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue