add tensor parallel support

Signed-off-by: Chen Xi <xi2chen@intel.com>

parent cb8507b3b4, commit c9ae1916ec
6 changed files with 44 additions and 28 deletions
@@ -29,6 +29,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_API int ggml_backend_sycl_rank(void);
+GGML_API int ggml_backend_sycl_world_size(void);
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
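Note (not part of the diff): a minimal caller-side sketch of the two entry points declared above; the helper name and the printf are illustrative only.

// Hypothetical usage of the new tensor-parallel queries.
#include <cstdio>
#include "ggml-sycl.h"

static void report_tp_topology() {
    const int rank       = ggml_backend_sycl_rank();        // this process's position in the TP group
    const int world_size = ggml_backend_sycl_world_size();  // number of participating ranks
    std::printf("tensor parallel rank %d of %d\n", rank, world_size);
}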
@@ -599,8 +599,8 @@ extern "C" {
         TENSOR_NO_CHANGE,
         TENSOR_SPLIT_BY_ROW,
         TENSOR_SPLIT_BY_COLUMN,
-        TENSOR_KEEPED_ON_MASTER,
-    }
+        TENSOR_KEEPED_ON_MASTER
+    };

     // n-dimensional tensor
     struct ggml_tensor {
@@ -637,9 +637,9 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        enum tensor_parallel_mode split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
+        enum tensor_parallel_mode split_mode; // {tensor_parallel_mode::TENSOR_NO_CHANGE};

-        // char padding[4];
+        char padding[12];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
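Note (my reading, not stated in the diff): ggml checks that sizeof(struct ggml_tensor) stays a multiple of GGML_MEM_ALIGN, so the new 4-byte split_mode field is balanced by growing the trailing padding. A toy C++ illustration of that constraint, with made-up stand-in fields:

// Toy layout, not the real ggml_tensor: adding a 4-byte field means the
// trailing padding must keep the total struct size 16-byte aligned.
#include <cstdint>

#define GGML_MEM_ALIGN 16

struct tp_padding_demo {
    int64_t data;        // stand-in for an 8-byte field
    int64_t extra;       // stand-in for an 8-byte field
    int32_t split_mode;  // stand-in for the newly added 4-byte enum field
    char    padding[12]; // 8 + 8 + 4 + 12 = 32, a multiple of GGML_MEM_ALIGN
};

static_assert(sizeof(tp_padding_demo) % GGML_MEM_ALIGN == 0,
              "struct size must stay a multiple of GGML_MEM_ALIGN");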
@@ -566,6 +566,17 @@ if (GGML_SYCL)
         list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
     endif()

+    set(oneCCL_DIR "/opt/intel/oneapi/ccl/latest/lib/cmake/oneCCL")
+    set(MPI_INCLUDE_PATH "/opt/intel/oneapi/mpi/latest/include")
+    set(MPI_LIBRARY_PATH "/opt/intel/oneapi/mpi/latest/lib/")
+    set(ONECCL_INCLUDE_PATH "/opt/intel/oneapi/ccl/latest/include")
+    set(ONECCL_LIBRARY_PATH "/opt/intel/oneapi/ccl/latest/lib/")
+    include_directories(${MPI_INCLUDE_PATH} ${ONECCL_INCLUDE_PATH})
+    find_library(MPI_LIBRARY mpi HINTS ${MPI_LIBRARY_PATH})
+    find_library(ONECCL_LIBRARY ccl HINTS ${ONECCL_LIBRARY_PATH})
+    # find_package(oneCCL REQUIRED)
+    message("-- oneCCL found")
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${MPI_LIBRARY_PATH} ${ONECCL_LIBRARY_PATH})
     if (WIN32)
         find_package(IntelSYCL REQUIRED)
         find_package(MKL REQUIRED)
@@ -1747,12 +1747,12 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)

 int ggml_backend_sycl_rank() {
     // use ccl rank as main gpu
-    return dpct::dev_mgr::instance().get_ccl_rank();
+    return dpct::dev_mgr::instance().get_rank();
 }

 int ggml_backend_sycl_world_size() {
     // use ccl rank as main gpu
-    return dpct::dev_mgr::instance().get_ccl_world_size();
+    return dpct::dev_mgr::instance().get_world_size();
 }

 void ggml_backend_sycl_print_sycl_devices() {
@@ -4237,9 +4237,9 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-static bool split_tensor(const struct ggml_tensor * src, void* dst, void* data, int split_mode) {
-    int rank = ggml_backend_sycl_rank()
-    int world_size = ggml_backend_sycl_world_size()
+static bool split_tensor(const struct ggml_tensor * src, void* dst, const void* data, enum tensor_parallel_mode split_mode) {
+    int rank = ggml_backend_sycl_rank();
+    int world_size = ggml_backend_sycl_world_size();
     auto type_traits = ggml_internal_get_type_traits(src->type);
     size_t element_size = type_traits.type_size / type_traits.blck_size;
     const int64_t dst_size = ggml_nelements(src) * element_size / world_size;
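Note (illustrative sketch, not the patch's implementation): the body of split_tensor is largely elided by this hunk. Conceptually each rank keeps a 1/world_size share of the weight, contiguous for a row split and strided for a column split, assuming the dimensions divide evenly and a plain (non-quantized) per-element size:

// Rough model of carving a per-rank shard out of a contiguous row-major weight.
#include <cstddef>
#include <cstdint>
#include <cstring>

enum class split_kind { by_row, by_column };

static void copy_shard(const uint8_t * src, uint8_t * dst,
                       int64_t ne0, int64_t ne1, size_t element_size,
                       int rank, int world_size, split_kind kind) {
    if (kind == split_kind::by_row) {
        // each rank takes a contiguous block of rows
        const int64_t rows_per_rank = ne1 / world_size;
        const size_t  row_bytes     = (size_t) ne0 * element_size;
        std::memcpy(dst,
                    src + (size_t) rank * rows_per_rank * row_bytes,
                    (size_t) rows_per_rank * row_bytes);
    } else {
        // each rank takes a slice of every row, so the copy is strided
        const int64_t cols_per_rank = ne0 / world_size;
        const size_t  slice_bytes   = (size_t) cols_per_rank * element_size;
        const size_t  row_bytes     = (size_t) ne0 * element_size;
        for (int64_t r = 0; r < ne1; ++r) {
            std::memcpy(dst + (size_t) r * slice_bytes,
                        src + (size_t) r * row_bytes + (size_t) rank * slice_bytes,
                        slice_bytes);
        }
    }
}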
@@ -4288,7 +4288,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     if (tensor->split_mode == tensor_parallel_mode::TENSOR_NO_CHANGE) {
         memcpy(host_buf, data, size);
     } else {
-        if (!split_tensor(tensor, host_buf, data, size, tensor->split_mode)) {
+        if (!split_tensor(tensor, ((void*)host_buf), data, tensor->split_mode)) {
             std::cerr << "split tensor failed!" << std::endl;
         }
     }
@@ -4505,8 +4505,8 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     static bool ggml_backend_sycl_buffer_type_initialized = false;

     if (!ggml_backend_sycl_buffer_type_initialized) {
-        if (dpct::dev_mgr::instance().world_size() > 1) {
-            auto rank = dpct::dev_mgr::instance().get_rank();
+        if (ggml_backend_sycl_world_size() > 1) {
+            auto rank = ggml_backend_sycl_rank();
             auto & device_tp = dpct::dev_mgr::instance().get_device(rank);
             queue_ptr stream = &(device_tp.default_queue());
             // TODO(xi): buffer_types always use 0 to avoid changes on public code
@@ -13,11 +13,13 @@
 #ifndef GGML_SYCL_DPCT_HELPER_HPP
 #define GGML_SYCL_DPCT_HELPER_HPP

 #include <stdlib.h>
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
+#include <oneapi/ccl.hpp>
 #include <oneapi/mkl.hpp>
 #include <map>
+#include <mpi.h>

 #include "ggml.h"
@@ -480,8 +482,6 @@ namespace dpct
     int _max_nd_range_size_i[3];
     uint32_t _device_id;
     std::array<unsigned char, 16> _uuid;
-    uint32_t _rank;
-    uint32_t _world_size;
 };

 static int get_major_version(const sycl::device &dev)
@@ -873,8 +873,8 @@ namespace dpct
         }
         return -1;
     }
-    inline int get_ccl_rank() { return _rank; }
-    inline int get_ccl_world_size() { return _world_size; }
+    inline int get_rank() { return _rank; }
+    inline int get_world_size() { return _world_size; }
     inline ccl::communicator create_ccl_communicator(ccl::device dev, ccl::context ctx) {
         return ccl::create_communicator(_world_size, _rank, dev, ctx, _kvs);
@@ -1002,7 +1002,13 @@ namespace dpct
         return convert_backend_index(backend1) < convert_backend_index(backend2);
     }

-    static void init_ccl() {
+    static void mpi_finalize() {
+        static int is_finalized = 0;
+        MPI_Finalized(&is_finalized);
+        if (!is_finalized) MPI_Finalize();
+    }
+
+    void init_ccl() {
         ccl::init();
         MPI_Init(NULL, NULL);
         MPI_Comm_size(MPI_COMM_WORLD, &_world_size);
@@ -1018,7 +1024,6 @@ namespace dpct
         MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
         _kvs = ccl::create_kvs(main_addr);
     }

-    }

     dev_mgr()
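Note (not part of the diff): once init_ccl has broadcast the kvs address, create_ccl_communicator (shown in the dpct helper hunk above) can build the communicator that split tensors ultimately need for collectives. A hedged sketch of a sum all-reduce over such a communicator using the public oneCCL C++ API; the function and buffer names are made up:

// Hypothetical follow-up to the bootstrap above: combine per-rank partial
// results by summing a float buffer in place across all ranks.
#include <vector>
#include <oneapi/ccl.hpp>

static void allreduce_partials(ccl::communicator & comm, std::vector<float> & buf) {
    ccl::allreduce(buf.data(), buf.data(), buf.size(),
                   ccl::reduction::sum, comm).wait(); // blocks until the collective completes
}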
@@ -4368,8 +4368,6 @@ struct llama_model_loader {
     int n_created = 0;
     // For tensor parallelism
     int world_size = 1;
     int rank = 0;
     bool enable_tp = false;

     int64_t n_elements = 0;
     size_t n_bytes = 0;
@@ -4630,7 +4628,6 @@ struct llama_model_loader {
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
     world_size = ggml_backend_get_world_size();
     rank = ggml_backend_get_rank();
 }

 ~llama_model_loader() {
@@ -4859,12 +4856,12 @@ struct llama_model_loader {
         ggml_set_name(tensor, ggml_get_name(cur));
         if (flags == TENSOR_SPLIT_BY_ROW) {
             tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_ROW;
-        }
-        if (flags == TENSOR_SPLIT_BY_COLUMN) {
+        } else if (flags == TENSOR_SPLIT_BY_COLUMN) {
             tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN;
-        }
-        if (flags == TENSOR_KEEPED_ON_MASTER) {
+        } else if (flags == TENSOR_KEEPED_ON_MASTER) {
             tensor->split_mode = tensor_parallel_mode::TENSOR_KEEPED_ON_MASTER;
-        }
+        } else {
+            tensor->split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
+        }

         if (flags == TENSOR_DUPLICATED) {
@@ -7023,8 +7020,9 @@ static bool llm_load_tensors(
     if (n_expert > 0 && hparams.n_expert_used == 0) {
         throw std::runtime_error("model has expert layers but no expert layers are used");
     }

     bool enable_tp = false;
     if (split_mode == LLAMA_SPLIT_MODE_TENSOR) {
         int world_size = ggml_backend_get_world_size();
         if (world_size > 1) {
             enable_tp = true;
             // need to change the size before load tensor
@@ -7078,7 +7076,7 @@ static bool llm_load_tensors(
         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, llama_model_loader::TENSOR_SPLIT_BY_COLUMN);

         // optional bias tensors
-        auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+        auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, bias_split_mode);
         layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, bias_split_mode);
         layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, bias_split_mode);
@@ -7109,7 +7107,7 @@ static bool llm_load_tensors(
         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_SPLIT_BY_ROW);

         // optional MLP bias
-        auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+        auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
         layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, bias_split_mode);
         layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_KEEPED_ON_MASTER);
         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, bias_split_mode);