step 8, rename all macro & func from cuda by sycl
This commit is contained in:
parent
3b1a743e82
commit
c2ef7a9cb9
9 changed files with 1102 additions and 1020 deletions
|
@ -104,6 +104,7 @@ option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fas
|
|||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
|
@ -448,14 +449,13 @@ endif()
|
|||
|
||||
if (LLAMA_SYCL)
|
||||
set(ENABLE_AOT ats)
|
||||
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Intel")
|
||||
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM")
|
||||
message(WARNING "${CMAKE_C_COMPILER_ID} Need IntelLLVM for SYCL")
|
||||
endif()
|
||||
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Intel")
|
||||
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "IntelLLVM")
|
||||
message(WARNING "${CMAKE_CXX_COMPILER_ID} Need IntelLLVM for SYCL")
|
||||
endif()
|
||||
|
||||
#find_package(SYCL REQUIRED)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
|
||||
# Check SYCL support by the compiler
|
||||
|
@ -473,10 +473,9 @@ if (LLAMA_SYCL)
|
|||
endif()
|
||||
|
||||
if (_sycl_support)
|
||||
add_compile_definitions(GGML_USE_CUBLAS)
|
||||
add_compile_definitions(GGML_USE_SYCL)
|
||||
#add_compile_definitions(GGML_USE_CUBLAS)
|
||||
add_compile_definitions(GGML_USE_SYCL)
|
||||
#add_compile_definitions(GGML_SYCL_F16)
|
||||
#add_compile_options(-std=c++17 -O3 -fsycl)
|
||||
add_compile_options(-I./)
|
||||
add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include)
|
||||
add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include/sycl)
|
||||
|
|
|
@ -42,6 +42,10 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
|
||||
#define GGML_USE_CUBLAS_SYCL
|
||||
#endif
|
||||
|
||||
int32_t get_num_physical_cores() {
|
||||
#ifdef __linux__
|
||||
// enumerate the set of thread siblings, num entries is num cores
|
||||
|
@ -601,9 +605,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CLBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CLBLAS_SYCL
|
||||
} else if (arg == "--split-mode" || arg == "-sm") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -620,14 +624,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CLBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CLBLAS_SYCL
|
||||
|
||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
|
@ -645,9 +651,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
params.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CLBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CLBLAS_SYCL
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--numa") {
|
||||
|
@ -1009,6 +1015,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
#ifdef GGML_USE_CLBLAS
|
||||
printf(" -nommq, --no-mul-mat-q\n");
|
||||
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
|
||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif // GGML_USE_CLBLAS
|
||||
#ifdef GGML_USE_SYCL
|
||||
printf(" -nommq, --no-mul-mat-q\n");
|
||||
printf(" use " GGML_SYCL_NAME " instead of custom mul_mat_q " GGML_SYCL_NAME " kernels.\n");
|
||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif // GGML_USE_SYCL
|
||||
#endif
|
||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||
|
|
|
@ -2319,7 +2319,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
|
@ -2345,7 +2345,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
}
|
||||
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
|
||||
{
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||
params.mul_mat_q = false;
|
||||
#else
|
||||
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
|
||||
|
@ -2358,7 +2358,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#else
|
||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
|
||||
|
|
|
@ -337,6 +337,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
|||
ggml_backend_cuda_reg_devices();
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
extern void ggml_backend_sycl_reg_devices(void);
|
||||
ggml_backend_sycl_reg_devices();
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
||||
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
|
|
1948
ggml-sycl.cpp
1948
ggml-sycl.cpp
File diff suppressed because it is too large
Load diff
|
@ -1,19 +1,59 @@
|
|||
#include <sycl/sycl.hpp>
|
||||
#include <dpct/dpct.hpp>
|
||||
// typedef sycl::half ggml_fp16_t;
|
||||
#pragma once
|
||||
|
||||
#define CHECK_TRY_ERROR(expr) \
|
||||
[&]() { \
|
||||
try { \
|
||||
expr; \
|
||||
return dpct::success; \
|
||||
} catch (std::exception const &e) { \
|
||||
std::cerr << e.what()<< "\nException caught at file:" << __FILE__ \
|
||||
<< ", line:" << __LINE__ <<", func:"<<__func__<< std::endl; \
|
||||
return dpct::default_error; \
|
||||
} \
|
||||
}()
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
// #define DEBUG_CUDA_MALLOC
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_SYCL_MAX_DEVICES 16
|
||||
#define GGML_SYCL_NAME "SYCL"
|
||||
|
||||
// Always success. To check if SYCL is actually loaded, use `ggml_sycl_loaded`.
|
||||
GGML_API void ggml_init_sycl(void);
|
||||
|
||||
// Returns `true` if there are available SYCL devices and cublas loads successfully; otherwise, it returns `false`.
|
||||
GGML_API bool ggml_sycl_loaded(void);
|
||||
|
||||
GGML_API void * ggml_sycl_host_malloc(size_t size);
|
||||
GGML_API void ggml_sycl_host_free(void * ptr);
|
||||
|
||||
GGML_API bool ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_sycl_set_tensor_split(const float * tensor_split);
|
||||
GGML_API void ggml_sycl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_sycl_free_data(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_sycl_assign_buffers(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_sycl_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
|
||||
GGML_API void ggml_sycl_copy_to_device(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_sycl_set_main_device(int main_device);
|
||||
GGML_API void ggml_sycl_set_mul_mat_q(bool mul_mat_q);
|
||||
GGML_API void ggml_sycl_set_scratch_size(size_t scratch_size);
|
||||
GGML_API void ggml_sycl_free_scratch(void);
|
||||
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API int ggml_sycl_get_device_count(void);
|
||||
GGML_API void ggml_sycl_get_device_description(int device, char * description, size_t description_size);
|
||||
|
||||
// backend API
|
||||
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
|
||||
|
||||
GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
|
||||
GGML_API int ggml_backend_sycl_get_device(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||
|
||||
// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
|
||||
int get_main_device();
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
24
ggml.c
24
ggml.c
|
@ -248,6 +248,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|||
#include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#include "ggml-sycl.hpp"
|
||||
#endif
|
||||
|
||||
// floating point type used to accumulate sums
|
||||
|
@ -2293,6 +2295,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||
ggml_init_cublas();
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
ggml_cl_init();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
ggml_init_sycl();
|
||||
#endif
|
||||
|
||||
ggml_setup_op_has_task_pass();
|
||||
|
@ -14687,6 +14691,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
|
||||
if (skip_cpu) {
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
||||
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
||||
#endif // GGML_USE_SYCL
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_DUP:
|
||||
{
|
||||
|
@ -20263,7 +20275,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|||
}
|
||||
|
||||
int ggml_cpu_has_blas(void) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
|
@ -20286,8 +20298,16 @@ int ggml_cpu_has_clblast(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sycl(void) {
|
||||
#if defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_gpublas(void) {
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_sycl();
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sse3(void) {
|
||||
|
|
17
llama.cpp
17
llama.cpp
|
@ -11,9 +11,7 @@
|
|||
# include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
# include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
# include "ggml-sycl.hpp"
|
||||
#endif
|
||||
|
||||
|
@ -1260,6 +1258,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
|||
if (host_buffer) {
|
||||
buft = ggml_backend_cuda_host_buffer_type();
|
||||
}
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_host_buffer_type();
|
||||
#elif defined(GGML_USE_CPU_HBM)
|
||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||
#endif
|
||||
|
@ -1279,6 +1279,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
|||
buft = ggml_backend_metal_buffer_type();
|
||||
#elif defined(GGML_USE_CUBLAS)
|
||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#endif
|
||||
|
@ -9935,6 +9937,15 @@ struct llama_context * llama_new_context_with_model(
|
|||
}
|
||||
}
|
||||
}
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
ctx->backend = ggml_backend_sycl_init(0);
|
||||
if (ctx->backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL backend\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||
if (ctx->backend_cpu == nullptr) {
|
||||
|
|
5
llama.h
5
llama.h
|
@ -6,6 +6,9 @@
|
|||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#include "ggml-sycl.hpp"
|
||||
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
|
||||
#else
|
||||
#define LLAMA_MAX_DEVICES 1
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
@ -46,7 +49,7 @@
|
|||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 4
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_SYCL)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue