Tensor parallelism
parent 971920e935 · commit 4f9640b8fe
10 changed files with 598 additions and 411 deletions
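The commit adds multi-GPU ("tensor parallel") support to the CUDA backend: weight matrices can be split row-wise across devices, with each GPU's share taken from a new --tensor-split option or, by default, from its share of total VRAM. As orientation before the diff, here is a minimal standalone sketch of the partitioning rule that the new ggml_cuda_op and ggml_cuda_load_data code applies; it is not part of the commit, and the device count, split values, row count and the GGML_CUDA_DMMV_Y stand-in are illustrative assumptions.

// Standalone illustration of the row-partitioning rule used below:
// g_tensor_split holds normalized prefix sums of the per-device proportions,
// and device id owns rows [row_low, row_high), rounded to a multiple of GGML_CUDA_DMMV_Y.
#include <cstdint>
#include <cstdio>

static const int   kDeviceCount   = 2;             // assumed
static const float kTensorSplit[] = {0.0f, 0.75f}; // prefix sums for a 3:1 split
static const int   kDmmvY         = 2;             // stand-in for GGML_CUDA_DMMV_Y

static void row_range(int64_t nrows, int id, int64_t * row_low, int64_t * row_high) {
    *row_low   = id == 0                ? 0     : (int64_t)(nrows*kTensorSplit[id]);
    *row_low  -= *row_low  % kDmmvY;
    *row_high  = id == kDeviceCount - 1 ? nrows : (int64_t)(nrows*kTensorSplit[id + 1]);
    *row_high -= *row_high % kDmmvY;
}

int main() {
    for (int id = 0; id < kDeviceCount; ++id) {
        int64_t lo, hi;
        row_range(4096, id, &lo, &hi); // 4096 rows is an arbitrary example
        printf("device %d: rows [%lld, %lld)\n", id, (long long) lo, (long long) hi);
    }
    return 0; // prints [0, 3072) for device 0 and [3072, 4096) for device 1
}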
@@ -9,6 +9,7 @@
#include <algorithm>
#include <sstream>
#include <unordered_set>
#include <regex>

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>

@@ -295,6 +296,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
std::string arg_next = argv[i];

// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);

for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
} else {
params.tensor_split[i] = 0.0f;
}
}
#else
fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {

@@ -438,6 +463,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");
fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");

@@ -484,6 +511,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {

lparams.n_ctx = params.n_ctx;
lparams.n_gpu_layers = params.n_gpu_layers;
memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
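The hunks above parse the new -ts/--tensor-split option and copy the result into llama_context_params. As a rough sketch of what the parser produces (not part of the commit; LLAMA_MAX_DEVICES is replaced by a stand-in constant), "--tensor-split 3,1" or "-ts 3/1" yields proportions {3, 1, 0, ...}, which the CUDA backend later normalizes so GPU 0 gets three quarters of each split tensor:

// Standalone sketch of the parsing above for the argument "3,1".
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    const int max_devices = 16;             // stands in for LLAMA_MAX_DEVICES
    const std::string arg_next = "3,1";

    const std::regex regex{R"([,/]+)"};     // split on ',' or '/'
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};

    std::vector<float> tensor_split(max_devices, 0.0f);
    for (size_t i = 0; i < split_arg.size() && i < tensor_split.size(); ++i) {
        tensor_split[i] = std::stof(split_arg[i]);
    }
    printf("device 0: %.1f, device 1: %.1f, device 2: %.1f, ...\n",
           tensor_split[0], tensor_split[1], tensor_split[2]); // 3.0, 1.0, 0.0
    return 0;
}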
@@ -28,6 +28,7 @@ struct gpt_params {
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs

// sampling parameters
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -401,6 +401,8 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");
fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
#endif
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());

@@ -503,6 +505,37 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
}
else if (arg == "--tensor-split" || arg == "-ts")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
std::string arg_next = argv[i];

// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);

for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i)
{
if (i < split_arg.size())
{
params.tensor_split[i] = std::stof(split_arg[i]);
}
else
{
params.tensor_split[i] = 0.0f;
}
}
#else
fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
}
else
{
632 ggml-cuda.cu

@@ -23,32 +23,33 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
} \
} while (0)

#if CUDART_VERSION >= 12
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
exit(1); \
} \
} while (0)

// Q = quantized, F = float, order is src0, src1, dst
enum ggml_cuda_op_type {
GGML_CUDA_OP_TYPE_QQQ = 0,
GGML_CUDA_OP_TYPE_QQF = 1,
GGML_CUDA_OP_TYPE_QFQ = 2,
GGML_CUDA_OP_TYPE_QFF = 3,
GGML_CUDA_OP_TYPE_FQQ = 4,
GGML_CUDA_OP_TYPE_FQF = 5,
GGML_CUDA_OP_TYPE_FFQ = 6,
GGML_CUDA_OP_TYPE_FFF = 7,
};
#else
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
#endif // CUDART_VERSION >= 11

typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
typedef void (*ggml_cuda_op_t)(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int i1, cudaStream_t & cudaStream_main);
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i0_low, int64_t i0_high, int i1, cudaStream_t & cudaStream_main);

// QK = number of values after dequantization
// QR = QK / number of values before dequantization

@@ -201,8 +202,8 @@ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const half * x = (const half *) vx;

v0 = __half2float(x[ib + 0]);
v1 = __half2float(x[ib + 1]);
v0 = __half2float(x[ib + iqs + 0]);
v1 = __half2float(x[ib + iqs + 1]);
}

template <int qk, int qr, dequantize_kernel_t dequantize_kernel>

@@ -344,7 +345,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f

static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<32, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {

@@ -396,14 +397,16 @@ struct cuda_buffer {
size_t size = 0;
};

static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;

static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
int id;
CUDA_CHECK(cudaGetDevice(&id));

for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
cuda_buffer& b = g_cuda_buffer_pool[i];
cuda_buffer& b = g_cuda_buffer_pool[id][i];
if (b.size >= size && b.ptr != nullptr) {
void * ptr = b.ptr;
*actual_size = b.size;

@@ -420,9 +423,11 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {

static void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
int id;
CUDA_CHECK(cudaGetDevice(&id));

for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
cuda_buffer& b = g_cuda_buffer_pool[i];
cuda_buffer& b = g_cuda_buffer_pool[id][i];
if (b.ptr == nullptr) {
b.ptr = ptr;
b.size = size;

@@ -435,33 +440,79 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {

#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
#define GGML_CUDA_MAX_EVENTS 64
static cublasHandle_t g_cublasH = nullptr;
static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_STREAMS] = { nullptr };
static cudaEvent_t g_cudaEvents_main[GGML_CUDA_MAX_EVENTS] = { nullptr };
static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_STREAMS] = { nullptr };
static cudaStream_t g_cudaStreams_memcpy_dst[GGML_CUDA_MAX_STREAMS] = { nullptr };
static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_EVENTS] = { nullptr };

static int g_device_count = -1;
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };

static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };

void ggml_init_cublas() {
if (g_cublasH == nullptr) {
static bool initialized = false;

if (!initialized) {
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
for (int i = 0; i < g_device_count; ++i) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
fprintf(stderr, " %d. %s\n", i+1, prop.name);
g_tensor_split[i] = total_vram;
total_vram += prop.totalGlobalMem;
}
for (int i = 0; i < g_device_count; ++i) {
g_tensor_split[i] /= total_vram;
}

for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(cudaSetDevice(id));

// create streams
for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[i], cudaStreamNonBlocking));
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[i], cudaStreamNonBlocking));
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_dst[i], cudaStreamNonBlocking));
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
}
// create events
for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_main[i], cudaEventDisableTiming));
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[i], cudaEventDisableTiming));
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
}

// create cublas handle
CUBLAS_CHECK(cublasCreate(&g_cublasH));
CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH));
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
}

// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

initialized = true;
}
}

void ggml_cuda_set_tensor_split(float * tensor_split) {
bool all_zero = true;
for (int i = 0; i < g_device_count; ++i) {
if (tensor_split[i] != 0.0f) {
all_zero = false;
break;
}
}
if (all_zero) {
return;
}
float split_sum = 0.0f;
for (int i = 0; i < g_device_count; ++i) {
g_tensor_split[i] = split_sum;
split_sum += tensor_split[i];
}
for (int i = 0; i < g_device_count; ++i) {
g_tensor_split[i] /= split_sum;
}
}
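ggml_cuda_set_tensor_split above converts the user proportions into normalized prefix sums, the same representation that ggml_init_cublas derives from each device's VRAM by default. A small worked example, with assumed values rather than anything from the diff: for two devices and tensor_split = {3, 1}, the first loop produces {0, 3}, split_sum = 4, and the second loop normalizes this to g_tensor_split = {0, 0.75}, so device 0 owns the first 75% of the rows of a split tensor and device 1 the rest.

// Minimal re-implementation of the normalization above (illustration only).
#include <cstdio>

int main() {
    const int device_count = 2;             // assumed
    float tensor_split[]   = {3.0f, 1.0f};  // user proportions, e.g. from -ts 3,1
    float g_tensor_split[2];

    float split_sum = 0.0f;
    for (int i = 0; i < device_count; ++i) {
        g_tensor_split[i] = split_sum;      // prefix sum of the proportions
        split_sum += tensor_split[i];
    }
    for (int i = 0; i < device_count; ++i) {
        g_tensor_split[i] /= split_sum;     // normalize to [0, 1)
    }
    printf("%.2f %.2f\n", g_tensor_split[0], g_tensor_split[1]); // 0.00 0.75
    return 0;
}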
@@ -485,26 +536,29 @@ void ggml_cuda_host_free(void * ptr) {
CUDA_CHECK(cudaFreeHost(ptr));
}

static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
const uint64_t ne0 = src->ne[0];
const uint64_t ne1 = src->ne[1];
const uint64_t nb0 = src->nb[0];
const uint64_t nb1 = src->nb[1];
const uint64_t nb2 = src->nb[2];
const uint64_t nb3 = src->nb[3];
const enum ggml_type type = src->type;
const size_t ts = ggml_type_size(type);
const size_t bs = ggml_blck_size(type);
static cudaError_t ggml_cuda_h2d_tensor_2d(
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {

const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
char * dst_char = (char *) dst;
const int64_t ne0 = src->ne[0];
const int64_t nb0 = src->nb[0];
const int64_t nb1 = src->nb[1];
const int64_t nb2 = src->nb[2];
const int64_t nb3 = src->nb[3];
const enum ggml_type type = src->type;
const int64_t ts = ggml_type_size(type);
const int64_t bs = ggml_blck_size(type);
int64_t i1_diff = i1_high - i1_low;

const void * x = (const void *) ((const char *) src->data + i1_low*nb1 + i2*nb2 + i3*nb3);
if (nb0 == ts && nb1 == ts*ne0/bs) {
return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
return cudaMemcpyAsync(dst_char, x, i1_diff*nb1, cudaMemcpyHostToDevice, stream);
} else if (nb0 == ts) {
return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
return cudaMemcpy2DAsync(dst_char, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyHostToDevice, stream);
} else {
for (uint64_t i1 = 0; i1 < ne1; i1++) {
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
const void * rx = (const void *) ((const char *) x + i1*nb1);
void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
void * rd = (void *) (dst_char + i1*ts*ne0/bs);
// pretend the row is a matrix with cols=1
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
if (r != cudaSuccess) return r;

@@ -513,114 +567,21 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor
}
}

static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];

const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];

const int nb10 = src1->nb[0];
const int nb11 = src1->nb[1];
const int nb12 = src1->nb[2];
const int nb13 = src1->nb[3];

const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];

const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne00;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
const int n_mm = ne03 * ne02;

size_t x_size, y_size, d_size;
half * d_X = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size);
half * d_Y = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size);
float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);

bool src1_cont_rows = nb10 == sizeof(float);
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
int i = i03*ne02 + i02;
cudaStream_t cudaStream = g_cudaStreams_main[i % GGML_CUDA_MAX_STREAMS];

half * c_X = d_X + i * x_ne;
half * c_Y = d_Y + i * y_ne;
float * c_D = d_D + i * d_ne;

// copy src0 to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));

// convert src1 to fp16
// TODO: use multiple threads
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
if (src1_cont_rows) {
if (src1_cont_cols) {
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
}
else {
for (int64_t i01 = 0; i01 < ne11; i01++) {
ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
}
}
}
else {
for (int64_t i01 = 0; i01 < ne11; i01++) {
for (int64_t i00 = 0; i00 < ne10; i00++) {
// very slow due to no inlining
tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
}
}
}

// copy src1 to device
CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream));

// compute
CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
CUBLAS_CHECK(
cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, c_X, CUDA_R_16F, ne00,
c_Y, CUDA_R_16F, ne10,
&beta, c_D, CUDA_R_32F, ne01,
CUBLAS_COMPUTE_32F_FAST_16F,
CUBLAS_GEMM_DEFAULT));

// copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
}
}

CUDA_CHECK(cudaDeviceSynchronize());
ggml_cuda_pool_free(d_X, x_size);
ggml_cuda_pool_free(d_Y, y_size);
ggml_cuda_pool_free(d_D, d_size);
}

inline void ggml_cuda_op_mul(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int i1, cudaStream_t & cudaStream_main){
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i0_low, int64_t i0_high, int i1,
cudaStream_t & cudaStream_main){

GGML_ASSERT(src0_ddf_i != nullptr);
GGML_ASSERT(src1_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];

const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];

for (int64_t i01 = 0; i01 < ne01; i01++) {
for (int64_t i01 = i0_low; i01 < i0_high; i01++) {
const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0

float * src0_ddf_i01 = src0_ddf_i + i01*ne00;

@@ -638,33 +599,34 @@ inline void ggml_cuda_op_mul(

inline void ggml_cuda_op_dequantize_mul_mat_vec(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int i1, cudaStream_t & cudaStream_main){
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i0_low, int64_t i0_high, int i1,
cudaStream_t & cudaStream_main){

GGML_ASSERT(src0_ddq_i != nullptr);
GGML_ASSERT(src1_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t nrows = i0_high - i0_low;

switch (src0->type) {
case GGML_TYPE_Q4_0:
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, ne01, cudaStream_main);
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_Q4_1:
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, ne01, cudaStream_main);
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_Q5_0:
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, ne01, cudaStream_main);
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_Q5_1:
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, ne01, cudaStream_main);
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_Q8_0:
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, ne01, cudaStream_main);
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_F16:
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, ne01, cudaStream_main);
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
default:
GGML_ASSERT(false);

@@ -680,7 +642,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(

inline void ggml_cuda_op_mul_mat_cublas(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int i1, cudaStream_t & cudaStream_main){
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i0_low, int64_t i0_high, int i1,
cudaStream_t & cudaStream_main){

GGML_ASSERT(src0_ddf_i != nullptr);
GGML_ASSERT(src1_ddf_i != nullptr);

@@ -690,30 +653,35 @@ inline void ggml_cuda_op_mul_mat_cublas(
const float beta = 0.0f;

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];

const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];

CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream_main));
const int64_t i0_diff = i0_high - i0_low;

int id;
CUDA_CHECK(cudaGetDevice(&id));

CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
CUBLAS_CHECK(
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
i0_diff, ne11, ne10,
&alpha, src0_ddf_i, ne00,
src1_ddf_i, ne10,
&beta, dst_ddf_i, ne01));
&beta, dst_ddf_i, i0_diff));

(void) dst;
(void) src0_ddq_i;
(void) i1;
}

template<enum ggml_cuda_op_type op_type, ggml_cuda_op_t op>
static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
ggml_cuda_op_t op, bool src0_needs_f32) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t nrows0 = ggml_nrows(src0);

const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];

@@ -726,69 +694,100 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];

// strides for iteration over dims 3 and 2
const int64_t src0_stride = ne00 * ne01;
const int64_t src1_stride = ne10 * ne11;
const int64_t dst_stride = ne0 * ne1;
const int64_t num_iters = ne02 * ne03;

const size_t src0_ts = ggml_type_size(src0->type);
const size_t src0_bs = ggml_blck_size(src0->type);

const bool src0_on_device = src0->backend == GGML_BACKEND_CUDA;
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

// indices of the devices on which the input data is stored
int src0_id = src0_extra == nullptr ? -1 : src0_extra->i_device;
int src1_id = src1_extra == nullptr ? -1 : src1_extra->i_device;

const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
const bool src0_needs_f32 = op_type & 0x4; // 3rd least significant bit = src0 needs f32

const bool src1_on_device = src1->backend == GGML_BACKEND_CUDA;
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU || src1->backend == GGML_BACKEND_GPU_SPLIT;

const bool dst_on_device = dst->backend == GGML_BACKEND_CUDA;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU || dst->backend == GGML_BACKEND_GPU_SPLIT;

const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);

// dd = data device
char * src0_ddq = nullptr; // quantized
float * src0_ddf = nullptr; // float
float * src1_ddf = nullptr;
float * dst_ddf = nullptr;

bool src0_ddq_malloced = false;
bool src0_ddf_malloced = false;
bool src1_ddf_malloced = false;
bool dst_ddf_malloced = false;
char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};

// asq = actual size quantized, asf = actual size float
size_t src0_asq, src0_asf, src1_asf, dst_asf;
size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};

for (int id = 0; id < g_device_count; ++id) {
// if data is on one device (!= -1) but not this one, continue
if (src0_id != -1 && src0_id != id) {
continue;
}
if (src1_id != -1 && src1_id != id) {
continue;
}

bool split = src0_id == -1 && src1_id == -1;
int64_t row_low, row_high;
if (split) {
row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
row_low -= row_low % GGML_CUDA_DMMV_Y;
row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
row_high -= row_high % GGML_CUDA_DMMV_Y;
} else {
row_low = 0;
row_high = ne01;
}
if (row_low == row_high) {
continue;
}

int64_t row_diff = row_high - row_low;

cudaSetDevice(id);

if (src0_on_device) {
if (src0_is_f32) {
src0_ddf = (float *) src0->data;
src0_ddf[id] = (float *) src0_extra->data_device[id];
} else {
src0_ddq = (char *) src0->data;
src0_ddq[id] = (char *) src0_extra->data_device[id];
}
} else {
if (src0_is_f32) {
src0_ddf = (float *) ggml_cuda_pool_malloc(num_iters * src0_stride * sizeof(float), &src0_asf);
src0_ddf_malloced = true;
src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
} else {
src0_ddq = (char *) ggml_cuda_pool_malloc(num_iters * src0_stride * src0_ts, &src0_asq);
src0_ddq_malloced = true;
src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
}
}

if (src0_needs_f32 && !src0_is_f32) {
src0_ddf = (float *) ggml_cuda_pool_malloc(num_iters * src0_stride * sizeof(float), &src0_asf);
src0_ddf_malloced = true;
src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
}

if (src1_on_device) {
src1_ddf = (float *) src1->data;
src1_ddf[id] = (float *) src1_extra->data_device[id];
} else {
src1_ddf = (float *) ggml_cuda_pool_malloc(num_iters * src1_stride * sizeof(float), &src1_asf);
src1_ddf_malloced = true;
src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
}
if (dst_on_device) {
dst_ddf = (float *) dst->data;
dst_ddf[id] = (float *) dst_extra->data_device[id];
} else {
dst_ddf = (float *) ggml_cuda_pool_malloc(num_iters * dst_stride * sizeof(float), &dst_asf);
dst_ddf_malloced = true;
size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
}

for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -797,34 +796,64 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
const int64_t i12 = i02 % ne12;

const int64_t i0 = i03*ne02 + i02;
const int64_t i0_offset_low = row_low/ne01;
const int64_t i0_offset_high = row_high/ne01;

int64_t i01_low = 0;
int64_t i01_high = ne01;
if (split) {
if (i0 < i0_offset_low || i0 > i0_offset_high) {
continue;
}
if (i0 == i0_offset_low) {
i01_low = row_low % ne01;
}
if (i0 == i0_offset_high) {
i01_high = row_high % ne01;
}
}
const int64_t i01_diff = i01_high - i01_low;
if (i01_diff == 0) {
continue;
}
const int64_t i1 = i13*ne12 + i12;

cudaStream_t cudaStream_main = g_cudaStreams_main[i0 % GGML_CUDA_MAX_STREAMS];
cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[i0 % GGML_CUDA_MAX_STREAMS];
cudaStream_t cudaStream_memcpy_dst = g_cudaStreams_memcpy_dst[i0 % GGML_CUDA_MAX_STREAMS];
cudaEvent_t cudaEvent_main = g_cudaEvents_main[i0 % GGML_CUDA_MAX_EVENTS];
cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[i0 % GGML_CUDA_MAX_EVENTS];
cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];

char * src0_ddq_i = src0_ddq + i0*src0_stride;
float * src0_ddf_i = src0_ddf + i0*src0_stride;
float * src1_ddf_i = src1_ddf + i1*src1_stride;
float * dst_ddf_i = dst_ddf + i0*dst_stride;
// for split tensors the data begins at i0 == i0_offset_low
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
float * src1_ddf_i = src1_ddf[id] + i1*src1_stride;
float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

// for split tensors the data pointer needs to be rounded down
// to the bin edge for i03, i02 bins beyond the first
if (i0 - i0_offset_low > 0) {
src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
src0_ddf_i -= (row_low % ne01)*ne00;
}
if (i0 - i0_offset_low > 0) {
dst_ddf_i -= (row_low % ne0)*ne1;
}

// copy src0, src1 to device if necessary
if (!src1_on_device) { // src1 first to avoid blocking device queues
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src1_ddf, src1, i03, i02, cudaStream_memcpy_src1));
if (!src1_on_device) {
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_memcpy_src1));
}
CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
if (!src0_on_device) {
if (src0_is_f32) {
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddf, src0, i03, i02, cudaStream_main));
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
} else {
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddq, src0, i03, i02, cudaStream_main));
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
}
}

// convert src0 to f32 if it's necessary for the ggml_cuda_op
if (src0_needs_f32 && !src0_is_f32) {
to_fp32_cuda(src0_ddq_i, src0_ddf_i, src0_stride, cudaStream_main);
to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
}

@@ -832,46 +861,63 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));

// do the computation
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i1, cudaStream_main);

CUDA_CHECK(cudaEventRecord(cudaEvent_main, cudaStream_main));
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i01_low, i01_high, i1, cudaStream_main);

// copy dst to host if necessary
if (!dst_on_device) {
// wait with memcpy until main stream is done
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_memcpy_dst, cudaEvent_main, 0));

if (split) {
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
// dst is NOT transposed.
// The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
// Instead they need to be copied to the correct slice in ne0 = dst row index.
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
for (int64_t j = 0; j < ne1; ++j) {
float * dhf_dst_i = (float *) ((char *) dst->data + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float),
cudaMemcpyDeviceToHost, cudaStream_main));
}
} else {
float * dhf_dst_i = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), cudaMemcpyDeviceToHost, cudaStream_memcpy_dst));
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), cudaMemcpyDeviceToHost, cudaStream_main));
}
}
}
}
}

// wait until each device is finished, then free their buffers
for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaDeviceSynchronize());
if (src0_ddf_malloced) {
ggml_cuda_pool_free(src0_ddf, src0_asf);
if (src0_asq[id] > 0) {
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
}
if (src0_ddq_malloced) {
ggml_cuda_pool_free(src0_ddq, src0_asq);
if (src0_asf[id] > 0) {
ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
}
if (src1_ddf_malloced) {
ggml_cuda_pool_free(src1_ddf, src1_asf);
if (src1_asf[id] > 0) {
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
}
if (dst_asf[id] > 0) {
ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
}
if (dst_ddf_malloced) {
ggml_cuda_pool_free(dst_ddf, dst_asf);
}
}

bool ggml_cuda_can_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
return src1->backend == GGML_BACKEND_CUDA;
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
(void) src0;
(void) dst;
return src1->backend == GGML_BACKEND_GPU;
}

void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_op<GGML_CUDA_OP_TYPE_FFF, ggml_cuda_op_mul>(src0, src1, dst);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
}

bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
const int64_t ne10 = src1->ne[0];

const int64_t ne0 = dst->ne[0];

@@ -881,81 +927,80 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) {
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU_SPLIT)) {
return true;
}

return false;
}

bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
size_t src0_sz = ggml_nbytes(src0);
size_t src1_sz = ggml_nbytes(src1);

// mul_mat_q: src0 is converted to fp32 on device
size_t mul_mat_q_transfer = src0_sz + src1_sz;

// mul_mat_f16: src1 is converted to fp16 on cpu
size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1);

// choose the smaller one to transfer to the device
// TODO: this is not always the best choice due to the overhead of converting to fp16
return mul_mat_f16_transfer < mul_mat_q_transfer;
}

void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));

if (src0->type == GGML_TYPE_F32) {
ggml_cuda_op<GGML_CUDA_OP_TYPE_FFF, ggml_cuda_op_mul_mat_cublas>(src0, src1, dst);
}
else if (src0->type == GGML_TYPE_F16) {
if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
// ggml_cuda_op<GGML_CUDA_OP_TYPE_QQF, ggml_cuda_op_mul_mat_cublas>(src0, src1, dst);
ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
}
else {
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
if (src1->ne[1] == 1) {
ggml_cuda_op<GGML_CUDA_OP_TYPE_QFF, ggml_cuda_op_dequantize_mul_mat_vec>(src0, src1, dst);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
} else {
ggml_cuda_op<GGML_CUDA_OP_TYPE_FFF, ggml_cuda_op_mul_mat_cublas>(src0, src1, dst);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
}
}
}
else if (ggml_is_quantized(src0->type)) {
if (src1->ne[1] == 1) {
ggml_cuda_op<GGML_CUDA_OP_TYPE_QFF, ggml_cuda_op_dequantize_mul_mat_vec>(src0, src1, dst);
} else {
ggml_cuda_op<GGML_CUDA_OP_TYPE_FFF, ggml_cuda_op_mul_mat_cublas>(src0, src1, dst);
}
}
else {
GGML_ASSERT(false);
}
}

size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
return ggml_nelements(src1) * sizeof(ggml_fp16_t);
}
else {
return 0;
}
}

void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset, int n_layer) {
FILE * fp = fopen(fname, "rb");
int nrows = ggml_nrows(tensor);
const size_t nb1 = tensor->nb[1];
ggml_backend backend = tensor->backend;
struct ggml_tensor_extra_gpu * extra = (struct ggml_tensor_extra_gpu *) tensor->extra;

const size_t size = ggml_nbytes(tensor);
for (int id = 0; id < g_device_count; ++id) {
extra->data_device[id] = nullptr;

int layer_low = id == 0 ? 0 : n_layer*g_tensor_split[id];
int layer_high = id == g_device_count - 1 ? n_layer : n_layer*g_tensor_split[id + 1];
if (backend == GGML_BACKEND_GPU && (extra->layer < layer_low || extra->layer >= layer_high)) {
continue;
}

cudaSetDevice(id);

int row_low, row_high;
if (backend == GGML_BACKEND_GPU) {
extra->i_device = id;

row_low = 0;
row_high = nrows;
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
extra->i_device = -1;

row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
row_low -= row_low % GGML_CUDA_DMMV_Y;
row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
row_high -= row_high % GGML_CUDA_DMMV_Y;
} else {
GGML_ASSERT(false);
}
if (row_low == row_high) {
continue;
}

int64_t nrows_split = row_high - row_low;

const size_t offset_split = offset + row_low*nb1;
const size_t size = ggml_nbytes_split(tensor, nrows_split);

void * buf;
CUDA_CHECK(cudaMalloc(&buf, size));
void * buf_host = malloc(size);

#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
#else
int ret = fseek(fp, (long) offset, SEEK_SET);
int ret = fseek(fp, (long) offset_split, SEEK_SET);
#endif
GGML_ASSERT(ret == 0); // same

@@ -968,38 +1013,59 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();

tensor->data = buf;
free(buf_host);
extra->data_device[id] = buf;
}

tensor->extra = extra;
fclose(fp);
}

void ggml_cuda_free_data(struct ggml_tensor * tensor) {
if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
return;
}

ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

for (int id = 0; id < g_device_count; ++id) {
if (extra->data_device[id] == nullptr) {
continue;
}

CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaFree(extra->data_device[id]));
}

delete extra;
}

bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
ggml_cuda_func_t func;

switch (tensor->op) {
case GGML_OP_MUL:
if (!ggml_cuda_can_mul(tensor->src0, tensor->src1, tensor)) {
return false;
}
if (params->ith != 0) {
return true;
}
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return true;
}
ggml_cuda_mul(tensor->src0, tensor->src1, tensor);
return true;
func = ggml_cuda_mul;
break;
case GGML_OP_MUL_MAT:
if (!ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
return false;
}
func = ggml_cuda_mul_mat;
break;
default:
return false;
}

if (params->ith != 0) {
return true;
}
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return true;
}
ggml_cuda_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize);
func(tensor->src0, tensor->src1, tensor);
return true;
default:
return false;
}
}
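For GGML_BACKEND_GPU_SPLIT tensors, ggml_cuda_load_data above no longer uploads one buffer per tensor; each device reads only its own row slice of the model file, starting at offset + row_low*nb1 and spanning ggml_nbytes_split(tensor, row_high - row_low) bytes. A small illustration with assumed numbers (4096 rows, the {0, 0.75} split boundaries from a 3:1 proportion, an arbitrary row stride, and the DMMV_Y rounding omitted for brevity):

// Illustrative offsets only; nb1, the row count and the file offset are made-up example values.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nrows            = 4096;
    const size_t  nb1              = 18 * 1024;   // bytes per row (example)
    const size_t  offset           = 0;           // tensor's offset in the file (example)
    const float   g_tensor_split[] = {0.0f, 0.75f};
    const int     device_count     = 2;

    for (int id = 0; id < device_count; ++id) {
        const int64_t row_low  = id == 0                ? 0     : (int64_t)(nrows*g_tensor_split[id]);
        const int64_t row_high = id == device_count - 1 ? nrows : (int64_t)(nrows*g_tensor_split[id + 1]);
        const size_t  offset_split = offset + row_low*nb1;
        const size_t  size_split   = (row_high - row_low)*nb1; // what ggml_nbytes_split returns for a contiguous slice
        printf("device %d reads %zu bytes at file offset %zu\n", id, size_split, offset_split);
    }
    return 0;
}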
14 ggml-cuda.h

@@ -1,10 +1,21 @@
#pragma once

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_CUDA_MAX_DEVICES 16

struct ggml_tensor_extra_gpu {
int layer; // which layer the tensor is on
int i_device; // which device the data is on
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
};

void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(float * tensor_split);

void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);

@@ -15,7 +26,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
void * ggml_cuda_host_malloc(size_t size);
void ggml_cuda_host_free(void * ptr);

void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset, int n_layer);
void ggml_cuda_free_data(struct ggml_tensor * tensor);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

#ifdef __cplusplus
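The new ggml_tensor_extra_gpu struct above hangs off ggml_tensor::extra and ties the pieces together: llama.cpp records which layer a tensor belongs to, ggml_cuda_load_data fills data_device[] with one allocation per GPU, and ggml_cuda_op later picks the per-device pointer by index. A rough host-side sketch of how a loader attaches the metadata, mirroring what llama.cpp's get_tensor_for does later in this diff; the stand-in tensor type and the layer value are assumptions, and no real CUDA calls are made:

#include <cstddef>

#define GGML_CUDA_MAX_DEVICES 16

struct ggml_tensor_extra_gpu {
    int layer;                                  // which layer the tensor is on
    int i_device;                               // which device the data is on (-1 = split)
    void * data_device[GGML_CUDA_MAX_DEVICES];  // one pointer per device for split tensors
};

struct fake_tensor {                            // stand-in for ggml_tensor; only the field used here
    void * extra = nullptr;
};

static void attach_extra(fake_tensor * tensor, int layer) {
    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
    extra->layer    = layer;  // consumed by ggml_cuda_load_data to pick the owning device
    extra->i_device = -1;     // set properly once the data is actually uploaded
    tensor->extra   = extra;  // data_device[] stays null until ggml_cuda_load_data runs
}

int main() {
    fake_tensor t;
    attach_extra(&t, /*layer =*/ 7); // layer index is an arbitrary example
    delete static_cast<ggml_tensor_extra_gpu *>(t.extra);
    return 0;
}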
@@ -676,7 +676,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
}

static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src1->backend == GGML_BACKEND_CL);
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];

@@ -789,7 +789,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t y_size;
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_CL) {
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);

@@ -800,7 +800,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy data to device
if (src0->backend != GGML_BACKEND_CL) {
if (src0->backend != GGML_BACKEND_GPU) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
}
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

@@ -829,7 +829,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
}
}

if (src0->backend != GGML_BACKEND_CL) {
if (src0->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_X, x_size);
}
ggml_cl_pool_free(d_Y, y_size);

@@ -865,7 +865,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t y_size;
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_CL) {
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);

@@ -879,7 +879,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy src0 to device
if (src0->backend != GGML_BACKEND_CL) {
if (src0->backend != GGML_BACKEND_GPU) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
}

@@ -936,7 +936,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
}
}

if (src0->backend != GGML_BACKEND_CL) {
if (src0->backend != GGML_BACKEND_GPU) {
ggml_cl_pool_free(d_X, x_size);
}
ggml_cl_pool_free(d_Y, y_size);

@@ -992,7 +992,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
if (src0->backend == GGML_BACKEND_CPU) {
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_CL) {
} else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->data;
} else {
GGML_ASSERT(false);

@@ -1077,7 +1077,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
return true;
}

@@ -1156,7 +1156,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
CL_CHECK(clFinish(queue));

tensor->data = dst;
tensor->backend = GGML_BACKEND_CL;
tensor->backend = GGML_BACKEND_GPU;
}

void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
14 ggml.c

@@ -3722,6 +3722,12 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
}

size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
}

int ggml_blck_size(enum ggml_type type) {
return GGML_BLCK_SIZE[type];
}

@@ -4144,6 +4150,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
/*.perf_time_us =*/ 0,
/*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
/*.pad =*/ { 0 },
};

@@ -8147,7 +8154,7 @@ static void ggml_compute_forward_mul_f32(
const int nth = params->nth;

#ifdef GGML_USE_CLBLAST
if (src1->backend == GGML_BACKEND_CL) {
if (src1->backend == GGML_BACKEND_GPU) {
if (ith == 0) {
ggml_cl_mul(src0, src1, dst);
}

@@ -12884,8 +12891,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
GGML_ASSERT(params);

#ifdef GGML_USE_CUBLAS
bool used_cuda = ggml_cuda_compute_forward(params, tensor);
if (used_cuda) {
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
if (skip_cpu) {
return;
}
#endif // GGML_USE_CUBLAS

@@ -14196,7 +14203,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
}
else
#elif defined(GGML_USE_CLBLAST)
10 ggml.h

@@ -249,8 +249,8 @@ extern "C" {

enum ggml_backend {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_CUDA = 1,
GGML_BACKEND_CL = 2,
GGML_BACKEND_GPU = 10,
GGML_BACKEND_GPU_SPLIT = 20,
};

// model file types

@@ -375,7 +375,9 @@ extern "C" {

char name[GGML_MAX_NAME];

char padding[16];
void * extra; // extra things e.g. for ggml-cuda.cu

char padding[4];
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -424,6 +426,7 @@ extern "C" {
struct ggml_compute_params {
enum ggml_task_type type;

// ith = thread index, nth = number of threads
int ith, nth;

// work buffer for all threads

@@ -445,6 +448,7 @@ extern "C" {
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

GGML_API int ggml_blck_size (enum ggml_type type);
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
85
llama.cpp
85
llama.cpp
|
@ -199,6 +199,12 @@ struct llama_model {
|
|||
if (ctx) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
||||
ggml_cuda_free_data(tensors_by_name[i].second);
|
||||
}
|
||||
#endif // GGML_USE_CUBLAS
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -665,7 +671,7 @@ struct llama_model_loader {
        }
    }

    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, int layer, ggml_backend backend) {
        auto it = tensors_map.name_to_idx.find(name);
        if (it == tensors_map.name_to_idx.end()) {
            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());

@ -676,10 +682,10 @@ struct llama_model_loader {
                name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
        }

        return get_tensor_for(lt, backend);
        return get_tensor_for(lt, layer, backend);
    }

    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, int layer, ggml_backend backend) {
        struct ggml_tensor * tensor;
        if (lt.ne.size() == 2) {
            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));

@ -689,6 +695,17 @@ struct llama_model_loader {
        }
        ggml_set_name(tensor, lt.name.c_str());
        LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor

#ifdef GGML_USE_CUBLAS
        if (backend == GGML_BACKEND_GPU || backend == GGML_BACKEND_GPU_SPLIT) {
            struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
            extra->layer = layer;
            tensor->extra = extra;
        }
#else
        (void) layer;
#endif // GGML_USE_CUBLAS

        tensor->backend = backend;
        lt.ggml_tensor = tensor;
        num_ggml_tensors_created++;
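get_tensor() and get_tensor_for() now carry the layer index so offloaded tensors can be tagged with the layer they belong to: for GGML_BACKEND_GPU and GGML_BACKEND_GPU_SPLIT the loader allocates a ggml_tensor_extra_gpu and hangs it off the new tensor->extra field, with layer = -1 passed for the non-layer weights (token embeddings, norm, output). A reduced sketch of how a backend could read that tag back (the struct shown is an illustrative subset; the real ggml_tensor_extra_gpu may hold more, e.g. per-device pointers):

// illustrative subset of the metadata stored behind tensor->extra
struct tensor_extra_gpu_view {
    int layer; // transformer layer index, or -1 for embeddings/norm/output
};

static int tensor_layer_sketch(const struct ggml_tensor * t) {
    if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
        return ((const struct tensor_extra_gpu_view *) t->extra)->layer;
    }
    return -1;
}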
@ -842,6 +859,7 @@ struct llama_context_params llama_context_default_params() {
    struct llama_context_params result = {
        /*.n_ctx        =*/ 512,
        /*.gpu_layers   =*/ 0,
        /*.tensor_split =*/ {0},
        /*.seed         =*/ -1,
        /*.f16_kv       =*/ true,
        /*.logits_all   =*/ false,
@ -926,6 +944,7 @@ static void llama_model_load_internal(
        llama_context & lctx,
        int n_ctx,
        int n_gpu_layers,
        float * tensor_split,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@ -1019,13 +1038,16 @@ static void llama_model_load_internal(
    }

#if defined(GGML_USE_CUBLAS)
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
    fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
#elif defined(GGML_USE_CLBLAST)
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CL
    fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
#else
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
#endif

    // prepare memory for the weights
@ -1037,45 +1059,46 @@ static void llama_model_load_internal(

        ml->ggml_ctx = ctx;

        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, -1, GGML_BACKEND_CPU);
        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          -1, GGML_BACKEND_CPU);

        // "output" tensor
        {
            ggml_backend backend_output;
            if (n_gpu_layers > int(n_layer)) { // NOLINT
                backend_output = LLAMA_BACKEND_OFFLOAD;
                backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
            } else {
                backend_output = GGML_BACKEND_CPU;
            }

            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, -1, backend_output);
        }

        const int i_gpu_start = n_layer - n_gpu_layers;

        model.layers.resize(n_layer);
        for (uint32_t i = 0; i < n_layer; ++i) {
            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
            const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

            auto & layer = model.layers[i];

            std::string layers_i = "layers." + std::to_string(i);

            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, i, backend);

            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, i, backend_split);
            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, i, backend_split);
            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, i, backend_split);
            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, i, backend_split);

            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, i, backend);

            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend);
            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend);
            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff}, backend);
            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, i, backend_split);
            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, i, backend_split);
            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff}, i, backend_split);

            if (backend == LLAMA_BACKEND_OFFLOAD) {
            if (backend == GGML_BACKEND_GPU) {
                vram_total +=
                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
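A concrete reading of the loop above: with n_layer = 32 and n_gpu_layers = 20, i_gpu_start = 12, so layers 0..11 stay on the CPU and layers 12..31 are offloaded. Within an offloaded layer the small norm vectors keep the plain GPU backend, while the large matrices (wq/wk/wv/wo and w1/w2/w3) get the split backend, so their rows are spread across the available devices in the proportions given by --tensor-split. Sketch with those numbers plugged in, assuming a CUDA build where LLAMA_BACKEND_OFFLOAD expands to GGML_BACKEND_GPU:

const int i_gpu_start = 32 - 20; // = 12
for (uint32_t i = 0; i < 32; ++i) {
    const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_GPU;       // norms
    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_GPU_SPLIT; // big matmuls
    // layers 0..11 -> CPU, layers 12..31 -> GPU / GPU_SPLIT
}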
@ -1127,6 +1150,8 @@ static void llama_model_load_internal(

#if defined(GGML_USE_CUBLAS)
    {
        ggml_cuda_set_tensor_split(tensor_split);

        size_t done_size = 0;
        size_t data_size = 0;
        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
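ggml_cuda_set_tensor_split() hands the user-supplied proportions to the CUDA backend before any weights are uploaded. A plausible sketch of what such a setter does with them, normalizing the proportions into cumulative row-fraction boundaries so device d owns the slice [bound[d], bound[d+1]) of every split tensor (the variable names are illustrative, not the backend's actual state):

// e.g. {3, 1} over two devices -> boundaries {0.0, 0.75, 1.0}
static void set_tensor_split_sketch(const float * tensor_split, int n_devices, float * bound) {
    float total = 0.0f;
    for (int i = 0; i < n_devices; ++i) {
        total += tensor_split[i];
    }
    bound[0] = 0.0f;
    for (int i = 0; i < n_devices; ++i) {
        bound[i + 1] = bound[i] + (total > 0.0f ? tensor_split[i] / total : 0.0f);
    }
}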
@ -1136,13 +1161,14 @@ static void llama_model_load_internal(
            }
        }
        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
            ggml_backend backend = lt.ggml_tensor->backend;
            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
                continue;
            }
            if (progress_callback) {
                progress_callback((float) done_size / data_size, progress_callback_user_data);
            }
            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off, hparams.n_layer);
            done_size += lt.size;
        }
    }
@ -1157,7 +1183,7 @@ static void llama_model_load_internal(
            }
        }
        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
            if (lt.ggml_tensor->backend != GGML_BACKEND_CL) {
            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
                continue;
            }
            if (progress_callback) {

@ -1167,6 +1193,8 @@ static void llama_model_load_internal(
            done_size += lt.size;
        }
    }
#else
    (void) tensor_split;
#endif

    if (progress_callback) {
@ -1185,6 +1213,7 @@ static bool llama_model_load(
        llama_context & lctx,
        int n_ctx,
        int n_gpu_layers,
        float * tensor_split,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,

@ -1192,8 +1221,8 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                  vocab_only, progress_callback, progress_callback_user_data);
        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, tensor_split, memory_type, use_mmap,
                                  use_mlock, vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::string & err) {
        fprintf(stderr, "error loading model: %s\n", err.c_str());
@ -2293,8 +2322,8 @@ struct llama_context * llama_init_from_file(

    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                          params.use_mmap, params.use_mlock, params.vocab_only,
    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, params.tensor_split,
                          memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                          params.progress_callback, params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);
@ -2547,7 +2576,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
            }
            size_t idx = model_loader->tensors_map.name_to_idx[base_name];
            llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, -1, GGML_BACKEND_CPU);
            lt.data = (uint8_t *) lt.ggml_tensor->data;
            model_loader->load_data_for(lt);
            lt.ggml_tensor->data = lt.data;
8  llama.h
@ -1,6 +1,13 @@
#ifndef LLAMA_H
#define LLAMA_H

#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
@ -67,6 +74,7 @@ extern "C" {
    struct llama_context_params {
        int n_ctx;        // text context
        int n_gpu_layers; // number of layers to store in VRAM
        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
        int seed;         // RNG seed, -1 for random

        bool f16_kv;      // use fp16 for KV cache
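For applications that link against llama.cpp directly, the new tensor_split field is filled the same way common.cpp fills it from the -ts/--tensor-split flag. A minimal usage sketch (the model path and layer count are placeholders):

#include "llama.h"

int main(void) {
    struct llama_context_params lparams = llama_context_default_params();
    lparams.n_gpu_layers    = 40;   // offload 40 layers
    lparams.tensor_split[0] = 3.0f; // device 0 gets ~75% of every split tensor
#if LLAMA_MAX_DEVICES > 1
    lparams.tensor_split[1] = 1.0f; // device 1 gets the remaining ~25%
#endif

    struct llama_context * lctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", lparams);
    if (lctx == NULL) {
        return 1;
    }

    // ... tokenize, evaluate, sample ...

    llama_free(lctx);
    return 0;
}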