add tensor parallel support
Signed-off-by: Chen Xi <xi2chen@intel.com>
parent cb8507b3b4 · commit c9ae1916ec
6 changed files with 44 additions and 28 deletions
@@ -29,6 +29,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_API int ggml_backend_sycl_rank(void);
+GGML_API int ggml_backend_sycl_world_size(void);
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
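For orientation, a minimal caller-side sketch (not part of this commit) of how the two new queries could be combined with the existing device-count API; the round-robin rank-to-device mapping and the helper name are assumptions for illustration only.

// Hypothetical sketch, not from this commit: map each MPI/oneCCL rank onto
// one of the visible SYCL devices using the new topology queries.
#include "ggml-sycl.h"
#include <cstdio>

static int pick_local_sycl_device(void) {
    const int rank       = ggml_backend_sycl_rank();             // added by this commit
    const int world_size = ggml_backend_sycl_world_size();       // added by this commit
    const int n_devices  = ggml_backend_sycl_get_device_count();
    const int device     = n_devices > 0 ? rank % n_devices : 0; // assumed round-robin policy
    std::printf("rank %d of %d -> SYCL device %d\n", rank, world_size, device);
    return device;
}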
@@ -599,8 +599,8 @@ extern "C" {
         TENSOR_NO_CHANGE,
         TENSOR_SPLIT_BY_ROW,
         TENSOR_SPLIT_BY_COLUMN,
-        TENSOR_KEEPED_ON_MASTER,
-    }
+        TENSOR_KEEPED_ON_MASTER
+    };

     // n-dimensional tensor
     struct ggml_tensor {
@@ -637,9 +637,9 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        enum tensor_parallel_mode split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
+        enum tensor_parallel_mode split_mode; // {tensor_parallel_mode::TENSOR_NO_CHANGE};

-        // char padding[4];
+        char padding[12];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
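A note on the padding bump (my reading, not stated in the diff): upstream ggml guards the tensor layout with a size assertion, so adding the 4-byte split_mode field needs the padding enlarged to keep the struct size aligned.

// Reproduced from upstream ggml from memory (treat the exact wording as
// approximate): the new 4-byte enum plus 12 bytes of padding add 16 bytes in
// total, keeping sizeof(struct ggml_tensor) a multiple of GGML_MEM_ALIGN.
static_assert(sizeof(struct ggml_tensor) % GGML_MEM_ALIGN == 0,
              "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");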
@@ -566,6 +566,17 @@ if (GGML_SYCL)
         list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
     endif()

+    set(oneCCL_DIR "/opt/intel/oneapi/ccl/latest/lib/cmake/oneCCL")
+    set(MPI_INCLUDE_PATH "/opt/intel/oneapi/mpi/latest/include")
+    set(MPI_LIBRARY_PATH "/opt/intel/oneapi/mpi/latest/lib/")
+    set(ONECCL_INCLUDE_PATH "/opt/intel/oneapi/ccl/latest/include")
+    set(ONECCL_LIBRARY_PATH "/opt/intel/oneapi/ccl/latest/lib/")
+    include_directories(${MPI_INCLUDE_PATH} ${ONECCL_INCLUDE_PATH})
+    find_library(MPI_LIBRARY mpi HINTS ${MPI_LIBRARY_PATH})
+    find_library(ONECCL_LIBRARY ccl HINTS ${ONECCL_LIBRARY_PATH})
+    # find_package(oneCCL REQUIRED)
+    message("-- oneCCL found")
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${MPI_LIBRARY_PATH} ${ONECCL_LIBRARY_PATH})
     if (WIN32)
         find_package(IntelSYCL REQUIRED)
         find_package(MKL REQUIRED)
@@ -1747,12 +1747,12 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)

 int ggml_backend_sycl_rank() {
     // use ccl rank as main gpu
-    return dpct::dev_mgr::instance().get_ccl_rank();
+    return dpct::dev_mgr::instance().get_rank();
 }

 int ggml_backend_sycl_world_size() {
     // use ccl rank as main gpu
-    return dpct::dev_mgr::instance().get_ccl_world_size();
+    return dpct::dev_mgr::instance().get_world_size();
 }

 void ggml_backend_sycl_print_sycl_devices() {
@@ -4237,9 +4237,9 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-static bool split_tensor(const struct ggml_tensor * src, void* dst, void* data, int split_mode) {
-    int rank = ggml_backend_sycl_rank()
-    int world_size = ggml_backend_sycl_world_size()
+static bool split_tensor(const struct ggml_tensor * src, void* dst, const void* data, enum tensor_parallel_mode split_mode) {
+    int rank = ggml_backend_sycl_rank();
+    int world_size = ggml_backend_sycl_world_size();
     auto type_traits = ggml_internal_get_type_traits(src->type);
     size_t element_size = type_traits.type_size / type_traits.blck_size;
     const int64_t dst_size = ggml_nelements(src) * element_size / world_size;
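The hunk ends before the actual copy logic, so here is a rough sketch of the contiguous (row-split) case only, under the assumption that a row split leaves each rank's shard contiguous in the source buffer; the helper name and parameters are hypothetical, only the size formula mirrors the diff.

// Hypothetical sketch of a contiguous row-split extraction; the commit's real
// offset/stride handling for the other modes is not visible in this hunk.
#include <cstring>
#include "ggml.h"

static bool split_tensor_rows_sketch(const struct ggml_tensor * src, void * dst,
                                     const void * data, int rank, int world_size) {
    const auto    traits       = ggml_internal_get_type_traits(src->type);
    const size_t  element_size = traits.type_size / traits.blck_size;          // same formula as the diff
    const int64_t shard_bytes  = ggml_nelements(src) * element_size / world_size;
    // With a row split, rank r's shard is the r-th contiguous chunk of the
    // original weight data, so a single memcpy is enough.
    std::memcpy(dst, (const char *) data + rank * shard_bytes, shard_bytes);
    return true;
}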
@@ -4288,7 +4288,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     if (tensor->split_mode == tensor_parallel_mode::TENSOR_NO_CHANGE) {
         memcpy(host_buf, data, size);
     } else {
-        if (!split_tensor(tensor, host_buf, data, size, tensor->split_mode)) {
+        if (!split_tensor(tensor, ((void*)host_buf), data, tensor->split_mode)) {
             std::cerr << "split tensor failed!" << std::endl;
         }
     }
@@ -4505,8 +4505,8 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     static bool ggml_backend_sycl_buffer_type_initialized = false;

     if (!ggml_backend_sycl_buffer_type_initialized) {
-        if (dpct::dev_mgr::instance().world_size() > 1) {
-            auto rank = dpct::dev_mgr::instance().get_rank();
+        if (ggml_backend_sycl_world_size() > 1) {
+            auto rank = ggml_backend_sycl_rank();
             auto & device_tp = dpct::dev_mgr::instance().get_device(rank);
             queue_ptr stream = &(device_tp.default_queue());
             // TODO(xi): buffer_types always use 0 to avoid changes on public code
@@ -13,11 +13,13 @@
 #ifndef GGML_SYCL_DPCT_HELPER_HPP
 #define GGML_SYCL_DPCT_HELPER_HPP

+#include <stdlib.h>
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
 #include <oneapi/ccl.hpp>
 #include <oneapi/mkl.hpp>
 #include <map>
+#include <mpi.h>

 #include "ggml.h"
@@ -480,8 +482,6 @@ namespace dpct
         int _max_nd_range_size_i[3];
         uint32_t _device_id;
         std::array<unsigned char, 16> _uuid;
-        uint32_t _rank;
-        uint32_t _world_size;
     };

     static int get_major_version(const sycl::device &dev)
@@ -873,8 +873,8 @@ namespace dpct
            }
            return -1;
        }
-       inline int get_ccl_rank() { return _rank; }
-       inline int get_ccl_world_size() { return _world_size; }
+       inline int get_rank() { return _rank; }
+       inline int get_world_size() { return _world_size; }
        inline ccl::communicator create_ccl_communicator(ccl::device dev, ccl::context ctx) {
            return ccl::create_communicator(_world_size, _rank, dev, ctx, _kvs);
@@ -1002,7 +1002,13 @@ namespace dpct
            return convert_backend_index(backend1) < convert_backend_index(backend2);
        }

-       static void init_ccl() {
+       static void mpi_finalize() {
+           static int is_finalized = 0;
+           MPI_Finalized(&is_finalized);
+           if (!is_finalized) MPI_Finalize();
+       }
+
+       void init_ccl() {
            ccl::init();
            MPI_Init(NULL, NULL);
            MPI_Comm_size(MPI_COMM_WORLD, &_world_size);
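For readers unfamiliar with the oneCCL bootstrap that init_ccl() follows, here is a condensed standalone sketch of the same pattern: rank 0 creates the main key-value store and broadcasts its address over MPI, the other ranks attach to it. The wrapper name and structure are illustrative; only the individual calls mirror the diff.

// Condensed illustration of the MPI + oneCCL bootstrap used above; the
// function name and return shape are hypothetical.
#include <mpi.h>
#include <oneapi/ccl.hpp>

static ccl::shared_ptr_class<ccl::kvs> bootstrap_kvs(int rank) {
    ccl::shared_ptr_class<ccl::kvs> kvs;
    ccl::kvs::address_type main_addr;
    if (rank == 0) {
        kvs = ccl::create_main_kvs();          // rank 0 owns the main KVS
        main_addr = kvs->get_address();
    }
    // Every rank receives rank 0's KVS address, matching the MPI_Bcast in the diff.
    MPI_Bcast((void *) main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
    if (rank != 0) {
        kvs = ccl::create_kvs(main_addr);      // other ranks attach to it
    }
    // A communicator is then built as in the diff:
    //   ccl::create_communicator(world_size, rank, dev, ctx, kvs);
    return kvs;
}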
@@ -1018,7 +1024,6 @@ namespace dpct
                MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
                _kvs = ccl::create_kvs(main_addr);
            }
        }

        dev_mgr()
@@ -4368,8 +4368,6 @@ struct llama_model_loader {
    int n_created = 0;
    // For tensor parallelism
    int world_size = 1;
-   int rank = 0;
-   bool enable_tp = false;

    int64_t n_elements = 0;
    size_t n_bytes = 0;
@@ -4630,7 +4628,6 @@ struct llama_model_loader {
        this->use_mmap = use_mmap;
        this->check_tensors = check_tensors;
        world_size = ggml_backend_get_world_size();
-       rank = ggml_backend_get_rank();
    }

    ~llama_model_loader() {
@@ -4859,12 +4856,12 @@ struct llama_model_loader {
        ggml_set_name(tensor, ggml_get_name(cur));
        if (flags == TENSOR_SPLIT_BY_ROW) {
            tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_ROW;
-       }
-       if (flags == TENSOR_SPLIT_BY_COLUMN) {
+       } else if (flags == TENSOR_SPLIT_BY_COLUMN) {
            tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN;
-       }
-       if (flags == TENSOR_KEEPED_ON_MASTER) {
+       } else if (flags == TENSOR_KEEPED_ON_MASTER) {
            tensor->split_mode = tensor_parallel_mode::TENSOR_KEEPED_ON_MASTER;
+       } else {
+           tensor->split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
        }

        if (flags == TENSOR_DUPLICATED) {
@@ -7023,8 +7020,9 @@ static bool llm_load_tensors(
    if (n_expert > 0 && hparams.n_expert_used == 0) {
        throw std::runtime_error("model has expert layers but no expert layers are used");
    }
+   bool enable_tp = false;
    if (split_mode == LLAMA_SPLIT_MODE_TENSOR) {
+       int world_size = ggml_backend_get_world_size();
        if (world_size > 1) {
            enable_tp = true;
            // need to change the size before load tensor
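The hunk cuts off right after the comment about changing sizes, so the following is only a guess at what that adjustment looks like; dividing exactly these hyper-parameters is an assumption, not something shown in the diff.

// Speculative sketch of "change the size before load tensor": with tensor
// parallelism each rank only materializes its shard, so a split dimension
// shrinks by the world size before the create_tensor calls run.
static int64_t shard_dim(int64_t full_dim, int world_size) {
    // Per-rank size of a dimension split across ranks, assuming it divides
    // evenly (e.g. n_ff for the row-split FFN weights created further down).
    return full_dim / world_size;
}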
@@ -7078,7 +7076,7 @@ static bool llm_load_tensors(
                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, llama_model_loader::TENSOR_SPLIT_BY_COLUMN);

                    // optional bias tensors
-                   auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+                   auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, bias_split_mode);
                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, bias_split_mode);
                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, bias_split_mode);
@@ -7109,7 +7107,7 @@ static bool llm_load_tensors(
                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_SPLIT_BY_ROW);

                    // optional MLP bias
-                   auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+                   auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
                    layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, bias_split_mode);
                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_KEEPED_ON_MASTER);
                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, bias_split_mode);
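A reading of why ffn_down_b is marked TENSOR_KEEPED_ON_MASTER rather than split (my interpretation, not stated in the diff): when the down-projection weight is sharded, each rank produces only a partial output that must be summed across ranks, and the bias should be added exactly once after that reduction. A self-contained toy illustration of that ordering, with hypothetical names:

// Hypothetical illustration, not from the commit: sum the per-rank partial
// outputs (stand-in for an all-reduce), then apply the bias exactly once.
#include <vector>

static std::vector<float> ffn_down_tp(const std::vector<std::vector<float>> & partial_per_rank,
                                      const std::vector<float> & bias_on_master) {
    std::vector<float> y(bias_on_master.size(), 0.0f);
    for (const auto & partial : partial_per_rank) {   // reduction over ranks
        for (size_t i = 0; i < y.size(); ++i) {
            y[i] += partial[i];
        }
    }
    for (size_t i = 0; i < y.size(); ++i) {           // bias applied once, on the master rank
        y[i] += bias_on_master[i];
    }
    return y;
}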