From c9ae1916ec19b2840cb404d6f9ca288c15cb63e4 Mon Sep 17 00:00:00 2001
From: Chen Xi
Date: Thu, 26 Sep 2024 02:14:04 +0000
Subject: [PATCH] add tensor parallel support

Signed-off-by: Chen Xi
---
 ggml/include/ggml-sycl.h           |  2 ++
 ggml/include/ggml.h                |  8 ++++----
 ggml/src/CMakeLists.txt            | 11 +++++++++++
 ggml/src/ggml-sycl.cpp             | 16 ++++++++--------
 ggml/src/ggml-sycl/dpct/helper.hpp | 17 +++++++++++------
 src/llama.cpp                      | 18 ++++++++----------
 6 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h
index 43ab1519c..188f51d4a 100644
--- a/ggml/include/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@@ -29,6 +29,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_API int ggml_backend_sycl_rank(void);
+GGML_API int ggml_backend_sycl_world_size(void);
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index a5813839f..9a9732824 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -599,8 +599,8 @@ extern "C" {
         TENSOR_NO_CHANGE,
         TENSOR_SPLIT_BY_ROW,
         TENSOR_SPLIT_BY_COLUMN,
-        TENSOR_KEEPED_ON_MASTER,
-    }
+        TENSOR_KEEPED_ON_MASTER
+    };
 
     // n-dimensional tensor
     struct ggml_tensor {
@@ -637,9 +637,9 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        enum tensor_parallel_mode split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
+        enum tensor_parallel_mode split_mode; // {tensor_parallel_mode::TENSOR_NO_CHANGE};
 
-        // char padding[4];
+        char padding[12];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index cbc349500..538be824e 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -566,6 +566,17 @@ if (GGML_SYCL)
         list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
     endif()
 
+    set(oneCCL_DIR "/opt/intel/oneapi/ccl/latest/lib/cmake/oneCCL")
+    set(MPI_INCLUDE_PATH "/opt/intel/oneapi/mpi/latest/include")
+    set(MPI_LIBRARY_PATH "/opt/intel/oneapi/mpi/latest/lib/")
+    set(ONECCL_INCLUDE_PATH "/opt/intel/oneapi/ccl/latest/include")
+    set(ONECCL_LIBRARY_PATH "/opt/intel/oneapi/ccl/latest/lib/")
+    include_directories(${MPI_INCLUDE_PATH} ${ONECCL_INCLUDE_PATH})
+    find_library(MPI_LIBRARY mpi HINTS ${MPI_LIBRARY_PATH})
+    find_library(ONECCL_LIBRARY ccl HINTS ${ONECCL_LIBRARY_PATH})
+    # find_package(oneCCL REQUIRED)
+    message("-- oneCCL found")
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${MPI_LIBRARY} ${ONECCL_LIBRARY})
 
     if (WIN32)
         find_package(IntelSYCL REQUIRED)
         find_package(MKL REQUIRED)
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index a9217658b..bf70c155b 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -1747,12 +1747,12 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
 
 int ggml_backend_sycl_rank() {
     // use ccl rank as main gpu
-    return dpct::dev_mgr::instance().get_ccl_rank();
+    return dpct::dev_mgr::instance().get_rank();
 }
 
 int ggml_backend_sycl_world_size() {
     // use ccl rank as main gpu
-    return dpct::dev_mgr::instance().get_ccl_world_size();
+    return dpct::dev_mgr::instance().get_world_size();
 }
 
 void ggml_backend_sycl_print_sycl_devices() {
@@ -4237,9 +4237,9 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static bool split_tensor(const struct ggml_tensor * src, void* dst, void* data, int split_mode) {
-    int rank = ggml_backend_sycl_rank()
-    int world_size = ggml_backend_sycl_world_size()
+static bool split_tensor(const struct ggml_tensor * src, void* dst, const void* data, enum tensor_parallel_mode split_mode) {
+    int rank = ggml_backend_sycl_rank();
+    int world_size = ggml_backend_sycl_world_size();
     auto type_traits = ggml_internal_get_type_traits(src->type);
     size_t element_size = type_traits.type_size / type_traits.blck_size;
     const int64_t dst_size = ggml_nelements(src) * element_size / world_size;
@@ -4288,7 +4288,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     if (tensor->split_mode == tensor_parallel_mode::TENSOR_NO_CHANGE) {
         memcpy(host_buf, data, size);
     } else {
-        if (!split_tensor(tensor, host_buf, data, size, tensor->split_mode)) {
+        if (!split_tensor(tensor, ((void*)host_buf), data, tensor->split_mode)) {
             std::cerr << "split tensor failed!" << std::endl;
         }
     }
@@ -4505,8 +4505,8 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     static bool ggml_backend_sycl_buffer_type_initialized = false;
 
     if (!ggml_backend_sycl_buffer_type_initialized) {
-        if (dpct::dev_mgr::instance().world_size() > 1) {
-            auto rank = dpct::dev_mgr::instance().get_rank();
+        if (ggml_backend_sycl_world_size() > 1) {
+            auto rank = ggml_backend_sycl_rank();
             auto & device_tp = dpct::dev_mgr::instance().get_device(rank);
             queue_ptr stream = &(device_tp.default_queue());
             // TODO(xi): buffer_types always use 0 to avoid changes on public code
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 700f65cc6..d73011c23 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -13,11 +13,13 @@
 #ifndef GGML_SYCL_DPCT_HELPER_HPP
 #define GGML_SYCL_DPCT_HELPER_HPP
 
+#include <mpi.h>
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
 #include <syclcompat/math.hpp>
 #include <oneapi/mkl.hpp>
 #include <map>
+#include <oneapi/ccl.hpp>
 
 #include "ggml.h"
 
@@ -480,8 +482,6 @@ namespace dpct
         int _max_nd_range_size_i[3];
         uint32_t _device_id;
         std::array<unsigned char, 16> _uuid;
-        uint32_t _rank;
-        uint32_t _world_size;
     };
 
     static int get_major_version(const sycl::device &dev)
@@ -873,8 +873,8 @@ namespace dpct
             }
             return -1;
         }
-        inline int get_ccl_rank() { return _rank; }
-        inline int get_ccl_world_size() { return _world_size; }
+        inline int get_rank() { return _rank; }
+        inline int get_world_size() { return _world_size; }
         inline ccl::communicator create_ccl_communicator(ccl::device dev, ccl::context ctx) {
             return ccl::create_communicator(_world_size, _rank, dev, ctx, _kvs);
         }
@@ -1002,7 +1002,13 @@ namespace dpct
             return convert_backend_index(backend1) < convert_backend_index(backend2);
         }
 
-        static void init_ccl() {
+        static void mpi_finalize() {
+            static int is_finalized = 0;
+            MPI_Finalized(&is_finalized);
+            if (!is_finalized) MPI_Finalize();
+        }
+
+        void init_ccl() {
             ccl::init();
             MPI_Init(NULL, NULL);
             MPI_Comm_size(MPI_COMM_WORLD, &_world_size);
@@ -1018,7 +1024,6 @@ namespace dpct
             MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
             _kvs = ccl::create_kvs(main_addr);
         }
-        }
 
 
         dev_mgr()
diff --git a/src/llama.cpp b/src/llama.cpp
index 21cd489e8..6b11a9adc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4368,8 +4368,6 @@ struct llama_model_loader {
     int n_created = 0;
     // For tensor parallelism
     int world_size = 1;
-    int rank = 0;
-    bool enable_tp = false;
 
     int64_t n_elements = 0;
     size_t  n_bytes    = 0;
@@ -4630,7 +4628,6 @@ struct llama_model_loader {
         this->use_mmap = use_mmap;
         this->check_tensors = check_tensors;
         world_size = ggml_backend_get_world_size();
-        rank = ggml_backend_get_rank();
     }
 
     ~llama_model_loader() {
@@ -4859,12 +4856,12 @@ struct llama_model_loader {
         ggml_set_name(tensor, ggml_get_name(cur));
         if (flags == TENSOR_SPLIT_BY_ROW) {
             tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_ROW;
-        }
-        if (flags == TENSOR_SPLIT_BY_COLUMN) {
+        } else if (flags == TENSOR_SPLIT_BY_COLUMN) {
             tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN;
-        }
-        if (flags == TENSOR_KEEPED_ON_MASTER) {
+        } else if (flags == TENSOR_KEEPED_ON_MASTER) {
             tensor->split_mode = tensor_parallel_mode::TENSOR_KEEPED_ON_MASTER;
+        } else {
+            tensor->split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
         }
 
         if (flags == TENSOR_DUPLICATED) {
@@ -7023,8 +7020,9 @@ static bool llm_load_tensors(
     if (n_expert > 0 && hparams.n_expert_used == 0) {
         throw std::runtime_error("model has expert layers but no expert layers are used");
     }
-
+    bool enable_tp = false;
     if (split_mode == LLAMA_SPLIT_MODE_TENSOR) {
+        int world_size = ggml_backend_get_world_size();
        if (world_size > 1) {
            enable_tp = true;
            // need to change the size before load tensor
@@ -7078,7 +7076,7 @@ static bool llm_load_tensors(
                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, llama_model_loader::TENSOR_SPLIT_BY_COLUMN);
 
                    // optional bias tensors
-                   auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+                   auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},     bias_split_mode);
                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, bias_split_mode);
                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, bias_split_mode);
@@ -7109,7 +7107,7 @@ static bool llm_load_tensors(
                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_SPLIT_BY_ROW);
 
                        // optional MLP bias
-                       auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+                       auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
                        layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff},   bias_split_mode);
                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_KEEPED_ON_MASTER);
                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff},   bias_split_mode);
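
Note, not part of the patch above: the diff only shows the new signature and the call site of split_tensor(); the body that performs the per-rank partitioning lies outside the hunk context. For orientation, below is a minimal sketch, assuming a plain float tensor with evenly divisible dimensions and using hypothetical helper names (split_rows, split_cols), of how a row-major [rows x cols] weight can be split per rank. The patch itself derives the element size from ggml_internal_get_type_traits() and dispatches on tensor->split_mode.

// Illustration only (hypothetical helpers, float-only, evenly divisible shapes).
#include <cstdint>
#include <cstring>

// Row split: rank r keeps the contiguous rows [r*rows/ws, (r+1)*rows/ws).
static void split_rows(const float * src, float * dst,
                       int64_t rows, int64_t cols, int rank, int world_size) {
    const int64_t rows_per_rank = rows / world_size;   // assumes rows % world_size == 0
    std::memcpy(dst, src + (int64_t) rank * rows_per_rank * cols,
                (size_t) (rows_per_rank * cols) * sizeof(float));
}

// Column split: rank r keeps columns [r*cols/ws, (r+1)*cols/ws) of every row.
static void split_cols(const float * src, float * dst,
                       int64_t rows, int64_t cols, int rank, int world_size) {
    const int64_t cols_per_rank = cols / world_size;   // assumes cols % world_size == 0
    for (int64_t r = 0; r < rows; ++r) {
        std::memcpy(dst + r * cols_per_rank,
                    src + r * cols + (int64_t) rank * cols_per_rank,
                    (size_t) cols_per_rank * sizeof(float));
    }
}

In the llama.cpp hunks above, the attention output weight is tagged TENSOR_SPLIT_BY_COLUMN and the FFN up weight TENSOR_SPLIT_BY_ROW, so each rank ends up holding 1/world_size of every split weight while TENSOR_KEEPED_ON_MASTER tensors stay whole on rank 0.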