step 8: rename all macros and functions from CUDA to SYCL

This commit is contained in:
jianyuzh 2024-01-07 16:55:55 +08:00 committed by Meng, Hengyu
parent 3b1a743e82
commit c2ef7a9cb9
9 changed files with 1102 additions and 1020 deletions

View file

@ -104,6 +104,7 @@ option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fas
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@ -448,14 +449,13 @@ endif()
if (LLAMA_SYCL)
set(ENABLE_AOT ats)
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Intel")
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM")
message(WARNING "${CMAKE_C_COMPILER_ID} Need IntelLLVM for SYCL")
endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Intel")
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "IntelLLVM")
message(WARNING "${CMAKE_CXX_COMPILER_ID} Need IntelLLVM for SYCL")
endif()
#find_package(SYCL REQUIRED)
find_package(IntelSYCL REQUIRED)
# Check SYCL support by the compiler
@ -473,10 +473,9 @@ if (LLAMA_SYCL)
endif()
if (_sycl_support)
add_compile_definitions(GGML_USE_CUBLAS)
#add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_SYCL)
#add_compile_definitions(GGML_SYCL_F16)
#add_compile_options(-std=c++17 -O3 -fsycl)
add_compile_options(-I./)
add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include)
add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include/sycl)

View file

@ -42,6 +42,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
#define GGML_USE_CUBLAS_SYCL
#endif
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@ -601,9 +605,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS
#ifndef GGML_USE_CLBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CLBLAS_SYCL
} else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
@ -620,14 +624,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS
#ifndef GGML_USE_CLBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CLBLAS_SYCL
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
@ -645,9 +651,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f;
}
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS
#ifndef GGML_USE_CLBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CLBLAS_SYCL
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--numa") {
@ -1009,6 +1015,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
#ifdef GGML_USE_CLBLAS
printf(" -nommq, --no-mul-mat-q\n");
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CLBLAS
#ifdef GGML_USE_SYCL
printf(" -nommq, --no-mul-mat-q\n");
printf(" use " GGML_SYCL_NAME " instead of custom mul_mat_q " GGML_SYCL_NAME " kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_SYCL
#endif
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");

View file

@ -2319,7 +2319,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
@ -2345,7 +2345,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
{
#ifdef GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.mul_mat_q = false;
#else
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
@ -2358,7 +2358,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});

View file

@ -337,6 +337,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_cuda_reg_devices();
#endif
#ifdef GGML_USE_SYCL
extern void ggml_backend_sycl_reg_devices(void);
ggml_backend_sycl_reg_devices();
#endif
#ifdef GGML_USE_METAL
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

File diff suppressed because it is too large Load diff

View file

@ -1,19 +1,59 @@
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
// typedef sycl::half ggml_fp16_t;
#pragma once
// Wraps `expr` in an immediately-invoked lambda so a statement can be used
// where a dpct error code is expected: evaluates `expr` and yields
// dpct::success on normal completion; if a std::exception escapes, prints its
// message plus the file/line/function of the call site to stderr and yields
// dpct::default_error instead of propagating the exception.
#define CHECK_TRY_ERROR(expr) \
[&]() { \
try { \
expr; \
return dpct::success; \
} catch (std::exception const &e) { \
std::cerr << e.what()<< "\nException caught at file:" << __FILE__ \
<< ", line:" << __LINE__ <<", func:"<<__func__<< std::endl; \
return dpct::default_error; \
} \
}()
#include "ggml.h"
#include "ggml-backend.h"
// #define DEBUG_CUDA_MALLOC
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 16
#define GGML_SYCL_NAME "SYCL"
// Always success. To check if SYCL is actually loaded, use `ggml_sycl_loaded`.
GGML_API void ggml_init_sycl(void);
// Returns `true` if there are available SYCL devices and the SYCL runtime loads successfully; otherwise, it returns `false`.
GGML_API bool ggml_sycl_loaded(void);
GGML_API void * ggml_sycl_host_malloc(size_t size);
GGML_API void ggml_sycl_host_free(void * ptr);
GGML_API bool ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_sycl_set_tensor_split(const float * tensor_split);
GGML_API void ggml_sycl_transform_tensor(void * data, struct ggml_tensor * tensor);
GGML_API void ggml_sycl_free_data(struct ggml_tensor * tensor);
GGML_API void ggml_sycl_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor);
GGML_API void ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor);
GGML_API void ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor);
GGML_API void ggml_sycl_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
GGML_API void ggml_sycl_copy_to_device(struct ggml_tensor * tensor);
GGML_API void ggml_sycl_set_main_device(int main_device);
GGML_API void ggml_sycl_set_mul_mat_q(bool mul_mat_q);
GGML_API void ggml_sycl_set_scratch_size(size_t scratch_size);
GGML_API void ggml_sycl_free_scratch(void);
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API int ggml_sycl_get_device_count(void);
GGML_API void ggml_sycl_get_device_description(int device, char * description, size_t description_size);
// backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
GGML_API int ggml_backend_sycl_get_device(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
int get_main_device();
#ifdef __cplusplus
}
#endif

24
ggml.c
View file

@ -248,6 +248,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.hpp"
#endif
// floating point type used to accumulate sums
@ -2293,6 +2295,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_init_cublas();
#elif defined(GGML_USE_CLBLAST)
ggml_cl_init();
#elif defined(GGML_USE_SYCL)
ggml_init_sycl();
#endif
ggml_setup_op_has_task_pass();
@ -14687,6 +14691,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS
#ifdef GGML_USE_SYCL
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
if (skip_cpu) {
return;
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_SYCL
switch (tensor->op) {
case GGML_OP_DUP:
{
@ -20263,7 +20275,7 @@ int ggml_cpu_has_wasm_simd(void) {
}
int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
@ -20286,8 +20298,16 @@ int ggml_cpu_has_clblast(void) {
#endif
}
// Reports whether this build was compiled with SYCL support enabled.
int ggml_cpu_has_sycl(void) {
#ifdef GGML_USE_SYCL
    return 1;
#else
    return 0;
#endif
}
// A GPU BLAS backend is available when any one of cuBLAS, CLBlast, or SYCL
// was compiled in.
int ggml_cpu_has_gpublas(void) {
    if (ggml_cpu_has_cublas()) {
        return 1;
    }
    if (ggml_cpu_has_clblast()) {
        return 1;
    }
    return ggml_cpu_has_sycl();
}
int ggml_cpu_has_sse3(void) {

View file

@ -11,9 +11,7 @@
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h"
#endif
#ifdef GGML_USE_SYCL
#elif defined(GGML_USE_SYCL)
# include "ggml-sycl.hpp"
#endif
@ -1260,6 +1258,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
if (host_buffer) {
buft = ggml_backend_cuda_host_buffer_type();
}
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_host_buffer_type();
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#endif
@ -1279,6 +1279,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUBLAS)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#endif
@ -9935,6 +9937,15 @@ struct llama_context * llama_new_context_with_model(
}
}
}
#elif defined(GGML_USE_SYCL)
if (model->n_gpu_layers > 0) {
ctx->backend = ggml_backend_sycl_init(0);
if (ctx->backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL backend\n", __func__);
}
}
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {

View file

@ -6,6 +6,9 @@
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.hpp"
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
@ -46,7 +49,7 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 4
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_SYCL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif