add tensor parallel support

Signed-off-by: Chen Xi <xi2chen@intel.com>
Chen Xi 2024-09-26 02:14:04 +00:00
parent cb8507b3b4
commit c9ae1916ec
6 changed files with 44 additions and 28 deletions

View file

@@ -29,6 +29,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_API int ggml_backend_sycl_rank(void);
+GGML_API int ggml_backend_sycl_world_size(void);
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();

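The two declarations added above expose the SYCL backend's process rank and world size. As a rough illustration of how a caller could use them to pick out its shard of a row-split weight, here is a minimal sketch; the helper name and the evenly-divisible-rows assumption are illustrative, not code from this patch:

    #include <cstddef>
    #include <cstdint>

    // Illustrative only: byte range owned by `rank` when a matrix of n_rows
    // rows (row_bytes bytes each) is split evenly by row across world_size
    // processes. Assumes n_rows is divisible by world_size.
    struct shard_range { size_t offset; size_t size; };

    static shard_range shard_for_rank(int rank, int world_size,
                                      int64_t n_rows, size_t row_bytes) {
        const int64_t rows_per_rank = n_rows / world_size;
        shard_range r;
        r.offset = (size_t) rank * (size_t) rows_per_rank * row_bytes;
        r.size   = (size_t) rows_per_rank * row_bytes;
        return r;
    }

With rank and world_size taken from ggml_backend_sycl_rank() and ggml_backend_sycl_world_size(), each process copies only its own byte range out of the full weight buffer.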
View file

@@ -599,8 +599,8 @@ extern "C" {
 TENSOR_NO_CHANGE,
 TENSOR_SPLIT_BY_ROW,
 TENSOR_SPLIT_BY_COLUMN,
-TENSOR_KEEPED_ON_MASTER,
-}
+TENSOR_KEEPED_ON_MASTER
+};
 // n-dimensional tensor
 struct ggml_tensor {
@@ -637,9 +637,9 @@ extern "C" {
 void * extra; // extra things e.g. for ggml-cuda.cu
-enum tensor_parallel_mode split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
+enum tensor_parallel_mode split_mode; // {tensor_parallel_mode::TENSOR_NO_CHANGE};
-// char padding[4];
+char padding[12];
 };
 static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

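The split_mode field loses its in-struct default initializer, presumably because ggml.h has to stay valid C and the C++-style default (with the tensor_parallel_mode:: scope) would not compile there; the old default survives only as a comment, and the trailing padding is adjusted, presumably to keep GGML_TENSOR_SIZE aligned. A small sketch of the property the code relies on instead, assuming (as upstream ggml does) that new tensors are zero-initialized apart from explicitly set fields:

    // TENSOR_NO_CHANGE is the first enumerator, so its value is 0; a tensor
    // whose split_mode is never written therefore still means "not split".
    static_assert(TENSOR_NO_CHANGE == 0,
                  "zero-initialized tensors must default to 'no split'");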
View file

@@ -566,6 +566,17 @@ if (GGML_SYCL)
 list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
 endif()
+set(oneCCL_DIR "/opt/intel/oneapi/ccl/latest/lib/cmake/oneCCL")
+set(MPI_INCLUDE_PATH "/opt/intel/oneapi/mpi/latest/include")
+set(MPI_LIBRARY_PATH "/opt/intel/oneapi/mpi/latest/lib/")
+set(ONECCL_INCLUDE_PATH "/opt/intel/oneapi/ccl/latest/include")
+set(ONECCL_LIBRARY_PATH "/opt/intel/oneapi/ccl/latest/lib/")
+include_directories(${MPI_INCLUDE_PATH} ${ONECCL_INCLUDE_PATH})
+find_library(MPI_LIBRARY mpi HINTS ${MPI_LIBRARY_PATH})
+find_library(ONECCL_LIBRARY ccl HINTS ${ONECCL_LIBRARY_PATH})
+# find_package(oneCCL REQUIRED)
+message("-- oneCCL found")
+set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${MPI_LIBRARY_PATH} ${ONECCL_LIBRARY_PATH})
 if (WIN32)
 find_package(IntelSYCL REQUIRED)
 find_package(MKL REQUIRED)

View file

@@ -1747,12 +1747,12 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
 int ggml_backend_sycl_rank() {
 // use ccl rank as main gpu
-return dpct::dev_mgr::instance().get_ccl_rank();
+return dpct::dev_mgr::instance().get_rank();
 }
 int ggml_backend_sycl_world_size() {
 // use ccl rank as main gpu
-return dpct::dev_mgr::instance().get_ccl_world_size();
+return dpct::dev_mgr::instance().get_world_size();
 }
 void ggml_backend_sycl_print_sycl_devices() {
@@ -4237,9 +4237,9 @@ catch (sycl::exception const &exc) {
 std::exit(1);
 }
-static bool split_tensor(const struct ggml_tensor * src, void* dst, void* data, int split_mode) {
-int rank = ggml_backend_sycl_rank()
-int world_size = ggml_backend_sycl_world_size()
+static bool split_tensor(const struct ggml_tensor * src, void* dst, const void* data, enum tensor_parallel_mode split_mode) {
+int rank = ggml_backend_sycl_rank();
+int world_size = ggml_backend_sycl_world_size();
 auto type_traits = ggml_internal_get_type_traits(src->type);
 size_t element_size = type_traits.type_size / type_traits.blck_size;
 const int64_t dst_size = ggml_nelements(src) * element_size / world_size;
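The reworked split_tensor signature above takes the tensor's split mode directly and writes this rank's shard into dst. The body is not shown in this hunk; as a rough sketch of the two copy patterns such a helper has to choose between (row split = one contiguous block, column split = a strided slice of every row), assuming row-major storage, evenly divisible dimensions, and illustrative names:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Illustrative only: copy rank `rank`'s shard of a row-major
    // [n_rows x n_cols] matrix of elem-byte elements into dst.
    static void copy_shard(void * dst, const void * src,
                           int64_t n_rows, int64_t n_cols, size_t elem,
                           int rank, int world_size, bool by_rows) {
        const char * s = (const char *) src;
        char * d = (char *) dst;
        if (by_rows) {
            // contiguous block of n_rows/world_size full rows
            const int64_t rows = n_rows / world_size;
            memcpy(d, s + (size_t) rank * rows * n_cols * elem,
                   (size_t) rows * n_cols * elem);
        } else {
            // n_cols/world_size columns taken out of every row
            const int64_t cols = n_cols / world_size;
            for (int64_t r = 0; r < n_rows; ++r) {
                memcpy(d + (size_t) r * cols * elem,
                       s + ((size_t) r * n_cols + (size_t) rank * cols) * elem,
                       (size_t) cols * elem);
            }
        }
    }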
@@ -4288,7 +4288,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
 if (tensor->split_mode == tensor_parallel_mode::TENSOR_NO_CHANGE) {
 memcpy(host_buf, data, size);
 } else {
-if (!split_tensor(tensor, host_buf, data, size, tensor->split_mode)) {
+if (!split_tensor(tensor, ((void*)host_buf), data, tensor->split_mode)) {
 std::cerr << "split tensor failed!" << std::endl;
 }
 }
@@ -4505,8 +4505,8 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
 static bool ggml_backend_sycl_buffer_type_initialized = false;
 if (!ggml_backend_sycl_buffer_type_initialized) {
-if (dpct::dev_mgr::instance().world_size() > 1) {
-auto rank = dpct::dev_mgr::instance().get_rank();
+if (ggml_backend_sycl_world_size() > 1) {
+auto rank = ggml_backend_sycl_rank();
 auto & device_tp = dpct::dev_mgr::instance().get_device(rank);
 queue_ptr stream = &(device_tp.default_queue());
 // TODO(xi): buffer_types always use 0 to avoid changes on public code

View file

@ -13,11 +13,13 @@
#ifndef GGML_SYCL_DPCT_HELPER_HPP
#define GGML_SYCL_DPCT_HELPER_HPP
#include <stdlib.h>
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <oneapi/ccl.hpp>
#include <oneapi/mkl.hpp>
#include <map>
#include <mpi.h>
#include "ggml.h"
@@ -480,8 +482,6 @@ namespace dpct
 int _max_nd_range_size_i[3];
 uint32_t _device_id;
 std::array<unsigned char, 16> _uuid;
-uint32_t _rank;
-uint32_t _world_size;
 };
static int get_major_version(const sycl::device &dev)
@ -873,8 +873,8 @@ namespace dpct
}
return -1;
}
-inline int get_ccl_rank() { return _rank; }
-inline int get_ccl_world_size() { return _world_size; }
+inline int get_rank() { return _rank; }
+inline int get_world_size() { return _world_size; }
inline ccl::communicator create_ccl_communicator(ccl::device dev, ccl::context ctx) {
return ccl::create_communicator(_world_size, _rank, dev, ctx, _kvs);
@ -1002,7 +1002,13 @@ namespace dpct
return convert_backend_index(backend1) < convert_backend_index(backend2);
}
-static void init_ccl() {
+static void mpi_finalize() {
+static int is_finalized = 0;
+MPI_Finalized(&is_finalized);
+if (!is_finalized) MPI_Finalize();
+}
+void init_ccl() {
ccl::init();
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &_world_size);
@ -1018,7 +1024,6 @@ namespace dpct
MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
_kvs = ccl::create_kvs(main_addr);
}
-}
dev_mgr()

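For reference, the init_ccl path above follows the usual oneCCL-over-MPI bootstrap: rank 0 creates the main key-value store and broadcasts its address, and every other rank attaches to the same store. A self-contained, host-only sketch of that handshake (error handling omitted; the patch builds a device communicator from a SYCL device and context rather than the plain host communicator shown here):

    #include <mpi.h>
    #include <oneapi/ccl.hpp>

    int main(int argc, char ** argv) {
        ccl::init();
        MPI_Init(&argc, &argv);

        int world_size = 0, rank = 0;
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        // Rank 0 owns the main key-value store; its address is broadcast so
        // that all ranks attach to the same store.
        ccl::shared_ptr_class<ccl::kvs> kvs;
        ccl::kvs::address_type main_addr;
        if (rank == 0) {
            kvs = ccl::create_main_kvs();
            main_addr = kvs->get_address();
        }
        MPI_Bcast((void *) main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
        if (rank != 0) {
            kvs = ccl::create_kvs(main_addr);
        }

        // Host communicator, enough to run collectives in this sketch.
        auto comm = ccl::create_communicator(world_size, rank, kvs);
        (void) comm;

        MPI_Finalize();
        return 0;
    }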
View file

@@ -4368,8 +4368,6 @@ struct llama_model_loader {
 int n_created = 0;
 // For tensor parallelism
 int world_size = 1;
-int rank = 0;
-bool enable_tp = false;
 int64_t n_elements = 0;
 size_t n_bytes = 0;
@@ -4630,7 +4628,6 @@ struct llama_model_loader {
 this->use_mmap = use_mmap;
 this->check_tensors = check_tensors;
 world_size = ggml_backend_get_world_size();
-rank = ggml_backend_get_rank();
 }
~llama_model_loader() {
@@ -4859,12 +4856,12 @@ struct llama_model_loader {
 ggml_set_name(tensor, ggml_get_name(cur));
 if (flags == TENSOR_SPLIT_BY_ROW) {
 tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_ROW;
-}
-if (flags == TENSOR_SPLIT_BY_COLUMN) {
+} else if (flags == TENSOR_SPLIT_BY_COLUMN) {
 tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN;
-}
-if (flags == TENSOR_KEEPED_ON_MASTER) {
+} else if (flags == TENSOR_KEEPED_ON_MASTER) {
 tensor->split_mode = tensor_parallel_mode::TENSOR_KEEPED_ON_MASTER;
+} else {
+tensor->split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
 }
 if (flags == TENSOR_DUPLICATED) {
@@ -7023,8 +7020,9 @@ static bool llm_load_tensors(
 if (n_expert > 0 && hparams.n_expert_used == 0) {
 throw std::runtime_error("model has expert layers but no expert layers are used");
 }
+bool enable_tp = false;
 if (split_mode == LLAMA_SPLIT_MODE_TENSOR) {
 int world_size = ggml_backend_get_world_size();
 if (world_size > 1) {
 enable_tp = true;
 // need to change the size before load tensor
@@ -7078,7 +7076,7 @@ static bool llm_load_tensors(
 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, llama_model_loader::TENSOR_SPLIT_BY_COLUMN);
 // optional bias tensors
-auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
 layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, bias_split_mode);
 layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, bias_split_mode);
 layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, bias_split_mode);
@@ -7109,7 +7107,7 @@ static bool llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_SPLIT_BY_ROW);
 // optional MLP bias
-auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
 layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, bias_split_mode);
 layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_KEEPED_ON_MASTER);
 layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, bias_split_mode);
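These create_tensor calls tag the attention and FFN weights with TENSOR_SPLIT_BY_ROW / TENSOR_SPLIT_BY_COLUMN, and the earlier "need to change the size before load tensor" comment refers to shrinking the sharded dimension by the world size before the shards are created and loaded. A minimal sketch of that adjustment with hypothetical names (the patch's actual resizing code is not part of this diff, and which index counts as "row" versus "column" here follows the sketch's own convention):

    #include <cstdint>

    // Illustrative only: each rank allocates 1/world_size of the sharded
    // dimension of a 2-D weight; the other dimension stays full size.
    static void apply_tp_split(int64_t ne[2], int world_size, bool split_by_row) {
        if (world_size <= 1) {
            return; // tensor parallelism not active
        }
        if (split_by_row) {
            ne[1] /= world_size;
        } else {
            ne[0] /= world_size;
        }
    }

Pairing row-split and column-split projections within a block (the diff shows wo as column-split and ffn_up as row-split) is the usual Megatron-style layout: each layer then needs only one all-reduce to combine the partial results.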