ggml : add unified SYCL backend for Intel GPUs (#2690)
* first update for migration * update init_cublas * add debug functio, commit all help code * step 1 * step 2 * step3 add fp16, slower 31->28 * add GGML_LIST_DEVICE function * step 5 format device and print * step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue * support main device is non-zero * step7 add debug for code path, rm log * step 8, rename all macro & func from cuda by sycl * fix error of select non-zero device, format device list * ren ggml-sycl.hpp -> ggml-sycl.h * clear CMAKE to rm unused lib and options * correct queue: rm dtct:get_queue * add print tensor function to debug * fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481 * summary dpct definition in one header file to replace folder:dpct * refactor device log * mv dpct definition from folder dpct to ggml-sycl.h * update readme, refactor build script * fix build with sycl * set nthread=1 when sycl, increase performance * add run script, comment debug code * add ls-sycl-device tool * add ls-sycl-device, rm unused files * rm rear space * dos2unix * Update README_sycl.md * fix return type * remove sycl version from include path * restore rm code to fix hang issue * add syc and link for sycl readme * rm original sycl code before refactor * fix code err * add know issue for pvc hang issue * enable SYCL_F16 support * align pr4766 * check for sycl blas, better performance * cleanup 1 * remove extra endif * add build&run script, clean CMakefile, update guide by review comments * rename macro to intel hardware * editor config format * format fixes * format fixes * editor format fix * Remove unused headers * skip build sycl tool for other code path * replace tab by space * fix blas matmul function * fix mac build * restore hip dependency * fix conflict * ren as review comments * mv internal function to .cpp file * export funciton print_sycl_devices(), mv class dpct definition to source file * update CI/action for sycl code, fix CI error of repeat/dup * fix action ID format issue * rm unused strategy * enable llama_f16 in ci * fix conflict * fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml * fix ci cases for unsupported data type * revert unrelated changed in cuda cmake remove useless nommq fix typo of GGML_USE_CLBLAS_SYCL * revert hip cmake changes * fix indent * add prefix in func name * revert no mmq * rm cpu blas duplicate * fix no_new_line * fix src1->type==F16 bug. * pass batch offset for F16 src1 * fix batch error * fix wrong code * revert sycl checking in test-sampling * pass void as arguments of ggml_backend_sycl_print_sycl_devices * remove extra blank line in test-sampling * revert setting n_threads in sycl * implement std::isinf for icpx with fast math. * Update ci/run.sh Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update examples/sycl/run-llama2.sh Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update examples/sycl/run-llama2.sh Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update CMakeLists.txt Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update CMakeLists.txt Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update CMakeLists.txt Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update CMakeLists.txt Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * add copyright and MIT license declare * update the cmd example --------- Co-authored-by: jianyuzh <jianyu.zhang@intel.com> Co-authored-by: luoyu-intel <yu.luo@intel.com> Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
b764b8f1d0
commit
0f648573dd
22 changed files with 15764 additions and 24 deletions
|
@ -42,6 +42,10 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
|
||||
#define GGML_USE_CUBLAS_SYCL
|
||||
#endif
|
||||
|
||||
int32_t get_num_physical_cores() {
|
||||
#ifdef __linux__
|
||||
// enumerate the set of thread siblings, num entries is num cores
|
||||
|
@ -599,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
} else if (arg == "--split-mode" || arg == "-sm") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -618,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
|
||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -643,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
params.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--numa") {
|
||||
|
@ -1007,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
#endif
|
||||
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||
printf(" -gan N, --grp-attn-n N\n");
|
||||
|
@ -1514,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue